Repository: graphcore/poptorch Branch: sdk-release-3.4 Commit: c2a8b17762f1 Files: 537 Total size: 3.4 MB Directory structure: gitextract_ur1femal/ ├── .ci/ │ └── view_component_trigger/ │ ├── Jenkinsfile │ └── jobs.groovy ├── .clang-format ├── .clang-tidy ├── .github/ │ ├── CODEOWNERS │ └── workflows/ │ └── apply_linters.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .style.yapf ├── CMakeLists.txt ├── License.txt ├── MANIFEST.in ├── README.md ├── config.buildenv.py ├── docs/ │ ├── common/ │ │ ├── _static/ │ │ │ └── css/ │ │ │ └── custom_rtd.css │ │ ├── conf.py │ │ └── custom_dic │ ├── poptorch_geometric/ │ │ ├── common/ │ │ │ └── conf.py │ │ └── user_guide/ │ │ ├── index.rst │ │ ├── installation.rst │ │ ├── intro.rst │ │ ├── legal.rst │ │ ├── performance.rst │ │ ├── reference.rst │ │ ├── supported_operations.rst │ │ └── tutorials.rst │ └── user_guide/ │ ├── CMakeLists.txt │ ├── api.py │ ├── batching.rst │ ├── buffers.py │ ├── debugging.py │ ├── debugging.rst │ ├── device_iterations.py │ ├── error_handling.py │ ├── example.rst │ ├── experimental.rst │ ├── hostio_optimisation.rst │ ├── index.rst │ ├── inferenceModel.py │ ├── installation.rst │ ├── intro.rst │ ├── legal.rst │ ├── mnist.py │ ├── overview.rst │ ├── phased_execution.py │ ├── pipeline_simple.py │ ├── poptorch.conf │ ├── poptorch_training_simple.py │ ├── precompilation.py │ ├── pytorch_to_poptorch.rst │ ├── reference.rst │ ├── replica_grouped_weights.py │ ├── sumAnchorReturnType.py │ ├── supported_ops.rst │ └── trainingModel.py ├── examples/ │ ├── CMakeLists.txt │ ├── bert_ipu.py │ ├── lstm.py │ ├── mnist.py │ └── simple_adder.py ├── popart_compiler/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── popart_compiler/ │ │ ├── CodeletsCompilation.hpp │ │ ├── Compiler.hpp │ │ ├── CompilerOperationMacros.inc.hpp │ │ ├── ManuallyAddedOperations.inc.hpp │ │ ├── SupportedOperations.inc.hpp │ │ └── Utils.hpp │ ├── source/ │ │ ├── CodeletsCompilation.cpp │ │ ├── Compiler.cpp │ │ ├── CompilerImpl.cpp │ │ 
├── SessionOptions.cpp │ │ ├── Utils.cpp │ │ ├── custom_operations/ │ │ │ ├── Embedding.cpp │ │ │ ├── FastGatherLastDim.cpp │ │ │ ├── FastGatherLastDim.hpp │ │ │ ├── FastGatherLastDimBwdCodelets.inc.cpp │ │ │ ├── FastGatherLastDimFwdCodelets.inc.cpp │ │ │ ├── HostOp.cpp │ │ │ ├── TorchSoftplus.cpp │ │ │ ├── TorchSoftplus.hpp │ │ │ ├── UpsampleBilinear2d.cpp │ │ │ └── UpsampleBilinear2dCodelets.inc.cpp │ │ └── include/ │ │ └── popart_compiler/ │ │ ├── CompilerImpl.hpp │ │ ├── CompilerOptions.hpp │ │ ├── CustomOps.hpp │ │ ├── MultiConvBuilder.hpp │ │ └── SessionOptionsImpl.hpp │ └── types/ │ └── include/ │ └── popart_compiler/ │ ├── CompilerTypes.hpp │ └── PopartEnums.hpp ├── poptorch/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── poptorch/ │ │ ├── DispatchTracer.hpp │ │ ├── InplaceOps.hpp │ │ ├── LowerToPopart.hpp │ │ ├── LowerToPopartFactories.hpp │ │ ├── PoplarExecutable.hpp │ │ ├── SessionOptionsParser.hpp │ │ └── Utils.hpp │ └── source/ │ ├── AddDetachOperations.cpp │ ├── AddSubgraphConnectionNodes.cpp │ ├── AliasProcessing.cpp │ ├── CPUOffloadingCleanUp.cpp │ ├── CompilerOps.cpp.inc │ ├── ErrorOnUnsupportedAten.cpp │ ├── FixupSetAvailableMemory.cpp │ ├── GNNOptimizations.cpp │ ├── GatherWithExpandedIndicesOptimization.cpp │ ├── ImplicitCasting.cpp │ ├── InplaceOps.cpp │ ├── LowerToPopart.cpp │ ├── LowerToPopartFactories.cpp │ ├── OpBuilder.cpp │ ├── OverlappedIO.cpp │ ├── PopartCanonicalization.cpp │ ├── PopartLateCanonicalization.cpp │ ├── PoplarExecutable.cpp │ ├── PoptorchStaticInit.hpp │ ├── PoptorchSymbols.cpp │ ├── PoptorchSymbols.hpp │ ├── RemoveSurplusIdentityLosses.cpp │ ├── RequiresGrad.cpp │ ├── SessionOptionsParser.cpp │ ├── Utils.cpp │ ├── dispatch_tracer/ │ │ ├── CMakeLists.txt │ │ ├── CommonHelperFunctions.cpp │ │ ├── CommonHelperFunctions.hpp │ │ ├── InplaceAliasMapper.cpp │ │ ├── InplaceAliasMapper.hpp │ │ ├── README.md │ │ ├── RegisterAtenOverloads.cpp │ │ ├── RegisterMetaOps.cpp.inc │ │ ├── RegisterOptionalAtenOps.cpp.inc │ │ ├── Tensor.cpp │ │ 
├── Tensor.hpp │ │ ├── TypeInferenceHandler.cpp │ │ ├── TypeInferenceHandler.hpp │ │ ├── ValueMapper.cpp │ │ ├── ValueMapper.hpp │ │ └── dispatchers/ │ │ ├── IDispatch.cpp │ │ ├── IDispatch.hpp │ │ ├── JitDispatch.cpp │ │ └── JitDispatch.hpp │ ├── include/ │ │ └── poptorch/ │ │ ├── AliasProcessing.hpp │ │ ├── CompilerOps.inc.hpp │ │ ├── ImplicitCasting.hpp │ │ ├── InplaceOpsPyTorch.hpp_nolint │ │ ├── OpBuilder.hpp │ │ ├── OverlappedIO.hpp │ │ ├── PopartCanonicalization.hpp │ │ ├── RequiresGrad.hpp │ │ └── TypeAndConstantCanonicalization.hpp │ ├── popart_canonicalization/ │ │ ├── ActivationOps.cpp │ │ ├── ArithmeticOps.cpp │ │ ├── AtenHandlers.gen.cpp │ │ ├── BilinearOps.cpp │ │ ├── BitwiseOps.cpp │ │ ├── BlasOps.cpp │ │ ├── ConstantOps.cpp │ │ ├── ConvolutionOps.cpp │ │ ├── CustomOps.cpp │ │ ├── DistanceOps.cpp │ │ ├── DropoutOps.cpp │ │ ├── EinsumOp.cpp │ │ ├── EinsumOp.hpp │ │ ├── EmbeddingOps.cpp │ │ ├── IndexOps.cpp │ │ ├── LossOps.cpp │ │ ├── NormalizationOps.cpp │ │ ├── OtherOps.cpp │ │ ├── PoolingOps.cpp │ │ ├── PopartCanonicalizationUtils.cpp │ │ ├── PopartCanonicalizationUtils.hpp │ │ ├── PoptorchHandlers.gen.cpp │ │ ├── PyGTorchScatterOps.cpp │ │ ├── PyGTorchSplineConvOps.cpp │ │ ├── RNNOps.cpp │ │ ├── RandomSamplingOps.cpp │ │ ├── ReduceOps.cpp │ │ ├── ReshapeOps.cpp │ │ ├── ScatterReduction.cpp │ │ ├── ScatterReduction.hpp │ │ ├── SliceOps.cpp │ │ ├── SoftmaxOps.cpp │ │ ├── TensorOps.cpp │ │ └── pyg_torch_cluster/ │ │ ├── FpsOp.cpp │ │ ├── GridOp.cpp │ │ └── NearestOp.cpp │ └── type_and_constant_canonicalization/ │ ├── AddListNumElements.cpp │ ├── CanonicaliseConstants.cpp │ ├── CastUnsupportedInputs.cpp │ ├── CheckAndChangeOutputTypes.cpp │ ├── EvaluateConstexprs.cpp │ └── MakeConstantIntParams.cpp ├── poptorch_compiler/ │ └── pytorch_bridge/ │ ├── CMakeLists.txt │ ├── IpuSession.cpp │ └── include/ │ └── pytorch_bridge/ │ ├── CompilerOptions.hpp │ ├── CompilerTypes.hpp │ ├── DebugInfo.hpp │ └── IpuSession.hpp ├── poptorch_err/ │ ├── CMakeLists.txt │ 
├── exception_info/ │ │ └── poptorch_err/ │ │ └── ExceptionInfo.hpp │ ├── include/ │ │ └── poptorch_err/ │ │ └── ExceptionHandling.hpp │ └── source/ │ └── ExceptionHandling.cpp ├── poptorch_geometric/ │ ├── CMakeLists.txt │ ├── License.txt │ ├── MANIFEST.in │ ├── README.md │ ├── config.buildenv.py │ ├── poptorch_geometric_third_party_licenses.txt │ ├── pyproject.toml │ ├── python/ │ │ ├── CMakeLists.txt │ │ ├── __init__.py │ │ ├── cluster_loader.py │ │ ├── collate.py │ │ ├── common.py │ │ ├── dataloader.py │ │ ├── fixed_size_options.py │ │ ├── masker.py │ │ ├── neighbor_loader.py │ │ ├── ops/ │ │ │ ├── __init__.py │ │ │ ├── aggregation_base.py │ │ │ ├── cluster_gcn_conv.py │ │ │ ├── hetero_linear.py │ │ │ ├── instance_norm.py │ │ │ ├── knn.py │ │ │ ├── knn_graph.py │ │ │ ├── knn_interpolate.py │ │ │ ├── mf_conv.py │ │ │ └── radius.py │ │ ├── override.py │ │ ├── py.typed │ │ ├── pyg_cluster_loader.py │ │ ├── pyg_collate.py │ │ ├── pyg_dataloader.py │ │ ├── stream_packing_sampler.py │ │ ├── types.py │ │ └── utils.py │ ├── requirements.txt │ ├── setup.cfg │ └── setup.py ├── poptorch_logging/ │ ├── CMakeLists.txt │ ├── include/ │ │ └── poptorch_logging/ │ │ ├── Error.hpp │ │ ├── Logging.hpp │ │ ├── LoggingLight.hpp │ │ └── Tracepoint.hpp │ └── source/ │ ├── Error.cpp │ ├── Logging.cpp │ └── Tracepoint.cpp ├── poptorch_third_party_licenses.txt ├── pyproject.toml ├── python/ │ ├── CMakeLists.txt │ ├── __init__.py │ ├── _args_parser.py │ ├── _dataloader.py │ ├── _impl.py │ ├── _logging.py │ ├── _optimizer_attributes.py │ ├── _options_config.py │ ├── _options_impl.py │ ├── _poplar_executor.py │ ├── _poptorch_data.py │ ├── _printing.py │ ├── _utils.py │ ├── enums.py │ ├── ops.py │ ├── optim.py │ ├── options.py │ ├── poptorch.cpp │ ├── profiling.py │ ├── py.typed │ └── testing.py ├── requirements.txt ├── scripts/ │ ├── PopAtenHandlers.py │ ├── PopParse.py │ ├── PopTorchHandlers.py │ ├── __init__.py │ ├── apply_linters.py │ ├── check_spelling.py │ ├── create_buildenv.py │ ├── 
docs_build.py │ ├── download_external_datasets.py │ ├── enable.sh.in │ ├── generate_poppyg_package.py │ ├── generate_python_package.py │ ├── popgen/ │ │ ├── __init__.py │ │ ├── api.py │ │ ├── generator.py │ │ ├── helpers.py │ │ ├── onnx.py │ │ ├── operatorfactory.py │ │ ├── poptorch.py │ │ ├── registry.py │ │ ├── transform.py │ │ └── values.py │ ├── set_version.py │ └── utils/ │ └── _utils.py ├── setup.cfg ├── setup.py ├── tests/ │ ├── .gitignore │ ├── CMakeLists.txt │ ├── activations_test.py │ ├── attach_detach_test.py │ ├── attach_detach_wait_for_ipu_test.py │ ├── batching_test.py │ ├── bert_small_and_medium_test.py │ ├── blas_test.py │ ├── bool_support_test.py │ ├── buffers_test.py │ ├── conftest.py │ ├── convs_test.py │ ├── cpp/ │ │ ├── CMakeLists.txt │ │ └── GNNOptimizationsTest.cpp │ ├── cpu_op_test.py │ ├── ctc_decoder_test.py │ ├── custom_loss_test.py │ ├── custom_ops/ │ │ ├── CMakeLists.txt │ │ ├── custom_add_scalar_op.cpp │ │ ├── custom_add_scalar_vec_op.cpp │ │ ├── custom_add_vec_scalar_mul_op.cpp │ │ ├── custom_cube_op.cpp │ │ ├── custom_leaky_relu_op.cpp │ │ ├── custom_many_attribute_op.cpp │ │ ├── custom_reduce_op.cpp │ │ └── custom_three_input_reduce_op.cpp │ ├── custom_ops_attributes_test.py │ ├── custom_ops_test.py │ ├── dataloader_test.py │ ├── debug_tensors_test.py │ ├── distance_ops_test.py │ ├── exception_test.py │ ├── fine_tuning_test.py │ ├── functional_test.py │ ├── generate_test_file.py │ ├── gnn/ │ │ ├── .gitignore │ │ ├── benchgnn/ │ │ │ ├── README.md │ │ │ ├── benchgnn.py │ │ │ ├── datasets.py │ │ │ ├── models.py │ │ │ ├── requirements.txt │ │ │ └── utils.py │ │ ├── benchgnn_ops/ │ │ │ ├── README.md │ │ │ ├── benchgnn_ops.py │ │ │ ├── builder.py │ │ │ ├── example_configs/ │ │ │ │ ├── common.yaml │ │ │ │ ├── scatter_testcase1.yaml │ │ │ │ └── scatter_testcase2.yaml │ │ │ ├── metrics.py │ │ │ ├── ops.py │ │ │ └── requirements.txt │ │ ├── conftest.py │ │ ├── nn/ │ │ │ ├── aggr/ │ │ │ │ ├── aggr_utils.py │ │ │ │ ├── conftest.py │ │ │ │ ├── 
test_attention.py │ │ │ │ ├── test_basic.py │ │ │ │ ├── test_deep_sets.py │ │ │ │ ├── test_equilibrium.py │ │ │ │ ├── test_fused.py │ │ │ │ ├── test_gmt.py │ │ │ │ ├── test_gru.py │ │ │ │ ├── test_lstm.py │ │ │ │ ├── test_mlp_aggr.py │ │ │ │ ├── test_multi.py │ │ │ │ ├── test_quantile.py │ │ │ │ ├── test_scaler.py │ │ │ │ ├── test_set2set.py │ │ │ │ ├── test_set_transformer.py │ │ │ │ └── test_sort.py │ │ │ ├── conftest.py │ │ │ ├── conv/ │ │ │ │ ├── conv_utils.py │ │ │ │ ├── test_agnn_conv.py │ │ │ │ ├── test_antisymmetric_conv.py │ │ │ │ ├── test_appnp.py │ │ │ │ ├── test_arma_conv.py │ │ │ │ ├── test_cg_conv.py │ │ │ │ ├── test_cheb_conv.py │ │ │ │ ├── test_cluster_gcn_conv.py │ │ │ │ ├── test_dna_conv.py │ │ │ │ ├── test_edge_conv.py │ │ │ │ ├── test_eg_conv.py │ │ │ │ ├── test_fa_conv.py │ │ │ │ ├── test_feast_conv.py │ │ │ │ ├── test_film_conv.py │ │ │ │ ├── test_gat_conv.py │ │ │ │ ├── test_gated_graph_conv.py │ │ │ │ ├── test_gatv2_conv.py │ │ │ │ ├── test_gcn2_conv.py │ │ │ │ ├── test_gcn_conv.py │ │ │ │ ├── test_gen_conv.py │ │ │ │ ├── test_general_conv.py │ │ │ │ ├── test_gin_conv.py │ │ │ │ ├── test_gmm_conv.py │ │ │ │ ├── test_gps_conv.py │ │ │ │ ├── test_graph_conv.py │ │ │ │ ├── test_gravnet_conv.py │ │ │ │ ├── test_han_conv.py │ │ │ │ ├── test_heat_conv.py │ │ │ │ ├── test_hetero_conv.py │ │ │ │ ├── test_hgt_conv.py │ │ │ │ ├── test_hypergraph_conv.py │ │ │ │ ├── test_le_conv.py │ │ │ │ ├── test_lg_conv.py │ │ │ │ ├── test_mf_conv.py │ │ │ │ ├── test_nn_conv.py │ │ │ │ ├── test_pan_conv.py │ │ │ │ ├── test_pdn_conv.py │ │ │ │ ├── test_pna_conv.py │ │ │ │ ├── test_point_conv.py │ │ │ │ ├── test_point_gnn_conv.py │ │ │ │ ├── test_point_transformer_conv.py │ │ │ │ ├── test_ppf_conv.py │ │ │ │ ├── test_res_gated_graph_conv.py │ │ │ │ ├── test_rgat_conv.py │ │ │ │ ├── test_rgcn_conv.py │ │ │ │ ├── test_sage_conv.py │ │ │ │ ├── test_sg_conv.py │ │ │ │ ├── test_signed_conv.py │ │ │ │ ├── test_simple_conv.py │ │ │ │ ├── test_spline_conv.py │ │ │ │ ├── 
test_ssg_conv.py │ │ │ │ ├── test_supergat_conv.py │ │ │ │ ├── test_tag_conv.py │ │ │ │ ├── test_transformer_conv.py │ │ │ │ ├── test_wl_conv.py │ │ │ │ ├── test_wl_conv_continuous.py │ │ │ │ └── test_x_conv.py │ │ │ ├── dense/ │ │ │ │ ├── dense_utils.py │ │ │ │ └── test_convs.py │ │ │ ├── functional/ │ │ │ │ ├── test_bro.py │ │ │ │ └── test_gini.py │ │ │ ├── kge/ │ │ │ │ ├── kge_utils.py │ │ │ │ ├── test_complex.py │ │ │ │ ├── test_distmult.py │ │ │ │ ├── test_rotate.py │ │ │ │ └── test_transe.py │ │ │ ├── nn_utils.py │ │ │ ├── norm/ │ │ │ │ ├── norm_utils.py │ │ │ │ ├── test_batch_norm.py │ │ │ │ ├── test_diff_group_norm.py │ │ │ │ ├── test_graph_norm.py │ │ │ │ ├── test_graph_size_norm.py │ │ │ │ ├── test_instance_norm.py │ │ │ │ ├── test_layer_norm.py │ │ │ │ ├── test_mean_subtraction_norm.py │ │ │ │ ├── test_msg_norm.py │ │ │ │ └── test_pair_norm.py │ │ │ ├── pool/ │ │ │ │ ├── pool_utils.py │ │ │ │ ├── test_asap.py │ │ │ │ ├── test_avg_pool.py │ │ │ │ ├── test_consecutive.py │ │ │ │ ├── test_decimation.py │ │ │ │ ├── test_edge_pool.py │ │ │ │ ├── test_fps.py │ │ │ │ ├── test_glob.py │ │ │ │ ├── test_graclus.py │ │ │ │ ├── test_max_pool.py │ │ │ │ ├── test_mem_pool.py │ │ │ │ ├── test_pan_pool.py │ │ │ │ ├── test_pool_knn.py │ │ │ │ ├── test_radius.py │ │ │ │ ├── test_sag_pool.py │ │ │ │ ├── test_select_topk.py │ │ │ │ ├── test_topk_pool.py │ │ │ │ └── test_voxel_grid.py │ │ │ ├── test_linear.py │ │ │ ├── test_loss.py │ │ │ ├── test_mish.py │ │ │ ├── test_sequential.py │ │ │ └── unpool/ │ │ │ └── test_interpolate.py │ │ ├── ops/ │ │ │ ├── test_knn.py │ │ │ ├── test_knn_graph.py │ │ │ ├── test_knn_interpolate.py │ │ │ ├── test_nearest.py │ │ │ ├── test_radius_op.py │ │ │ ├── test_spline_conv_ops.py │ │ │ └── test_to_dense_batch.py │ │ ├── test_basic_gnn.py │ │ ├── test_cluster_loader.py │ │ ├── test_collate.py │ │ ├── test_dataloader.py │ │ ├── test_encoding.py │ │ ├── test_fixed_size_options.py │ │ ├── test_masker.py │ │ ├── test_model_args.py │ │ ├── 
test_neighbor_loader.py │ │ ├── test_register_custom_args.py │ │ ├── test_stream_packing_sampler.py │ │ └── utils.py │ ├── grouping_scatters_gathers_test.py │ ├── gru_test.py │ ├── half_float_test.py │ ├── half_test.py │ ├── helpers.py │ ├── hooks_test.py │ ├── if_test.py │ ├── index_ops_test.py │ ├── inplace_test.py │ ├── inputs_test.py │ ├── io_performance_test.py │ ├── ipu_print_tensor_test.py │ ├── loop_test.py │ ├── losses_test.py │ ├── lstm_test.py │ ├── math_ops_test.py │ ├── misc_nn_layers_test.py │ ├── misc_test.py │ ├── multiconv_test.py │ ├── non_contiguous_tensors_test.py │ ├── norms_test.py │ ├── ops_test.py │ ├── optimizers_test.py │ ├── options_test.py │ ├── other_ops_test.py │ ├── outputs_test.py │ ├── overlapped_io_test.py │ ├── phased_execution_test.py │ ├── pipelining_test.py │ ├── pooling_and_padding_test.py │ ├── popdist_test.py │ ├── poplar_executor_test.py │ ├── precompilation_test.py │ ├── pyg_torch_scatter_test.py │ ├── random_sampling_test.py │ ├── reduce_ops_test.py │ ├── replicated_graph_test.py │ ├── requires_grad_test.py │ ├── rnn_test.py │ ├── sharding_test.py │ ├── slice_test.py │ ├── tensor_ops_test.py │ ├── test_doc_urls.py │ ├── test_perf_counters.py │ ├── timeout_handler.py │ ├── torch_nn_test.py │ ├── torchvision_inference_test.py │ ├── type_support_test.py │ └── weights_writing_test.py └── version.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .ci/view_component_trigger/Jenkinsfile ================================================ @Library('sw-jenkins-library@view-component-trigger') _ viewComponentTrigger(jobsFilepath: '.ci/view_component_trigger/jobs.groovy') ================================================ FILE: .ci/view_component_trigger/jobs.groovy ================================================ [ [ job: '/poptorch/poptorch_pr', parameters: [ string(name: 'GCCI_BRANCH', value: 'mk2-main') ] ], ] 
================================================ FILE: .clang-format ================================================ Language: Cpp BasedOnStyle: llvm ================================================ FILE: .clang-tidy ================================================ Checks: '*, -abseil*, -altera*, -android*, -cppcoreguidelines*, -cert*, -modernize*, -boost*, -google*, -fuchsia*, -hicpp*, -objc*, -llvm*, -bugprone-exception-escape, -readability-uppercase-literal-suffix, -misc-non-private-member-variables-in-classes, -fuchsia-default-arguments-declarations, -fuchsia-default-arguments-calls, -readability-magic-numbers, -fuchsia-overloaded-operator, -performance-noexcept-move-constructor, -concurrency-mt-unsafe, -readability-function-cognitive-complexity, -misc-throw-by-value-catch-by-reference, -misc-no-recursion, -bugprone-narrowing-conversions, -bugprone-easily-swappable-parameters, -readability-make-member-function-const, -readability-use-anyofallof, -readability-identifier-length,-misc-confusable-identifiers,-bugprone-reserved-identifier,-misc-unused-using-decls' WarningsAsErrors: '*' HeaderFilterRegex: '' AnalyzeTemporaryDtors: false CheckOptions: - key: readability-identifier-naming.NamespaceCase value: lower_case - key: readability-identifier-naming.ClassCase value: CamelCase - key: readability-identifier-naming.StructCase value: CamelCase - key: readability-identifier-naming.PrivateMemberPrefix value: _ - key: readability-identifier-naming.ProtectedMemberPrefix value: _ - key: readability-identifier-naming.MemberCase value: lower_case - key: readability-identifier-naming.StructCase value: CamelCase - key: readability-identifier-naming.MethodCase value: camelBack - key: readability-identifier-naming.FunctionCase value: camelBack - key: readability-identifier-naming.VariableCase value: lower_case - key: misc-throw-by-value-catch-by-reference.MaxSize value: '8' ================================================ FILE: .github/CODEOWNERS 
================================================ * @Software-GCAI/poptorch ================================================ FILE: .github/workflows/apply_linters.yml ================================================ name: apply_linters.py git trailer check on: push: branches: [mk2-main] pull_request: branches: [mk2-main] jobs: apply_linters: timeout-minutes: 10 name: apply_linters.py git trailer check runs-on: [self-hosted, linux] steps: - uses: actions/checkout@v3 with: # 0 indicates fetch history for all branches and tags. # By default the checkout action only checks out the PR # ref. However apply_linters.py needs run git commands # that reference origin/mk2-main. fetch-depth: 0 # Checkout the head instead of the merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Verify most recent commit's git trailer run: python scripts/apply_linters.py --check-trailer ================================================ FILE: .gitignore ================================================ build __pycache__ .linters .cache .vscode test_data ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: check-merge-conflict - id: trailing-whitespace - repo: local hooks: - id: apply_linters name: apply_linters entry: scripts/apply_linters.py language: python args: [-a, --add-trailer-on-success, --debug, --git-strategy=pre-commit] additional_dependencies: [pyyaml==6.0.0, packaging==23.0.0, colorama==0.4.6] # For the git trailer to be correct apply_linters.py must be applied to all the files. 
================================================ FILE: .pylintrc ================================================ [MASTER] # Prevent pylint from incorrectly reporting 'has no member' for C modules # by allowing them to be loaded (does not happen by default due to security # concerns) extension-pkg-whitelist=numpy,torch # Specify a configuration file. #rcfile= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). # This adds /tests to the path (Needed for tests in subfolders to find helpers.py) init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.join(os.path.dirname(os.path.realpath(find_pylintrc())), 'tests'))" # Profiled execution. profile=no # Add files or directories to the blacklist. They should be base names, not # paths. ignore=git # Pickle collected data for later comparisons. persistent=yes # List of plugins (as comma separated values of python modules names) to load, # usually to register additional checkers. load-plugins= [MESSAGES CONTROL] # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option # multiple time. See also the "--disable" option for examples. enable=indexing-exception,old-raise-syntax # Disable the message, report, category or checker with the given id(s). You # can either give multiple identifiers separated by comma (,) or put this # option multiple times (only on the command line, not in the configuration # file where it should appear only once).You can also use "--disable=all" to # disable everything first and then reenable specific checks. For example, if # you want to run only the similarities checker, you can use "--disable=all # --enable=similarities". 
If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W"
ignore-mixin-members=yes # List of classes names for which member attributes should not be checked # (useful for classes with attributes dynamically set). # Workaround for pylint incorrectly reporting 'has no member' for torch # https://github.com/pytorch/pytorch/issues/701 ignored-classes=SQLObject,torch ignored-modules=torch # When zope mode is activated, add a predefined set of Zope acquired attributes # to generated-members. zope=no # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E0201 when accessed. Python regular # expressions are accepted. generated-members=REQUEST,acl_users,aq_parent # List of decorators that create context managers from functions, such as # contextlib.contextmanager. contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager [VARIABLES] # Tells whether we should check for unused import in __init__ files. init-import=no # A regular expression matching the beginning of the name of dummy variables # (i.e. not used). dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) # List of additional names supposed to be defined in builtins. Remember that # you should avoid to define new builtins when possible. additional-builtins= [BASIC] # Required attributes for module, separated by a comma required-attributes= # List of builtins function names that should not be used, separated by a comma bad-functions=apply,input,reduce # Disable the report(s) with the given id(s). # All non-Google reports are disabled by default. 
disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923 # Regular expression which should only match correct module names module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ # Regular expression which should only match correct module level names const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ # Regular expression which should only match correct class names class-rgx=^_?[A-Z][a-zA-Z0-9]*$ # Regular expression which should only match correct function names function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ # Regular expression which should only match correct method names method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ # Regular expression which should only match correct instance attribute names attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ # Regular expression which should only match correct argument names argument-rgx=^[a-z][a-z0-9_]*$ # Regular expression which should only match correct variable names variable-rgx=^[a-z][a-z0-9_]*$ # Regular expression which should only match correct attribute names in class # bodies class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ # Regular expression which should only match correct list comprehension / # generator expression variable names inlinevar-rgx=^[a-z][a-z0-9_]*$ # Good variable names which should always be accepted, separated by a comma good-names=main,_ # Bad variable names which should always be refused, separated by a comma bad-names= # Regular expression which should only match function or class names that do # not require a docstring. no-docstring-rgx=(__.*__|main) # Minimum line length for functions/classes that require docstrings, shorter # ones are exempt. docstring-min-length=10 [FORMAT] # Maximum number of characters on a single line. 
max-line-length=80 # Regexp for a line that is allowed to be longer than the limit. ignore-long-lines=(?x) (^\s*(import|from)\s |\$Id:\s\/\/depot\/.+#\d+\s\$ |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+') |^\s*\#\ LINT\.ThenChange |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$ |pylint |""" |\# |lambda |(https?|ftp):) # Allow the body of an if to be on the same line as the test if there is no # else. single-line-if-stmt=y # List of optional constructs for which whitespace checking is disabled no-space-check=trailing-comma # Maximum number of lines in a module max-module-lines=99999 # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 # tab). indent-string=' ' [SIMILARITIES] # Minimum lines number of a similarity. min-similarity-lines=4 # Ignore comments when computing similarities. ignore-comments=yes # Ignore docstrings when computing similarities. ignore-docstrings=yes # Ignore imports when computing similarities. ignore-imports=no [MISCELLANEOUS] # List of note tags to take in consideration, separated by a comma. notes= [IMPORTS] # Deprecated modules which should not be used, separated by a comma deprecated-modules=regsub,TERMIOS,Bastion,rexec,sets # Create a graph of every (i.e. internal and external) dependencies in the # given file (report RP0402 must not be disabled) import-graph= # Create a graph of external dependencies in the given file (report RP0402 must # not be disabled) ext-import-graph= # Create a graph of internal dependencies in the given file (report RP0402 must # not be disabled) int-import-graph= [CLASSES] # List of interface methods to ignore, separated by a comma. This is used for # instance to not check methods defines in Zope's Interface base class. 
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by # List of method names used to declare (i.e. assign) instance attributes. defining-attr-methods=__init__,__new__,setUp # List of valid names for the first argument in a class method. valid-classmethod-first-arg=cls,class_ # List of valid names for the first argument in a metaclass class method. valid-metaclass-classmethod-first-arg=mcs [DESIGN] # Maximum number of arguments for function / method max-args=5 # Argument names that match this expression will be ignored. Default to name # with leading underscore ignored-argument-names=_.* # Maximum number of locals for function / method body max-locals=15 # Maximum number of return / yield for function / method body max-returns=6 # Maximum number of branch for function / method body max-branches=12 # Maximum number of statements in function / method body max-statements=50 # Maximum number of parents for a class (see R0901). max-parents=7 # Maximum number of attributes for a class (see R0902). max-attributes=7 # Minimum number of public methods for a class (see R0903). min-public-methods=2 # Maximum number of public methods for a class (see R0904). max-public-methods=20 [EXCEPTIONS] # Exceptions that will emit a warning when being caught. Defaults to # "Exception" overgeneral-exceptions=Exception,StandardError,BaseException [AST] # Maximum line length for lambdas short-func-length=1 # List of module members that should be marked as deprecated. # All of the string functions are listed in 4.1.4 Deprecated string functions # in the Python 2.4 docs. 
deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc [DOCSTRING] # List of exceptions that do not need to be mentioned in the Raises section of # a docstring. ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError [TOKENS] # Number of spaces of indent required when the last token on the preceding line # is an open (, [, or {. indent-after-paren=4 [GOOGLE LINES] # Regexp for a proper copyright notice. copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\. ================================================ FILE: .style.yapf ================================================ [style] # Align closing bracket with visual indentation. align_closing_bracket_with_visual_indent=True # Allow dictionary keys to exist on multiple lines. For example: # # x = { # ('this is the first element of a tuple', # 'this is the second element of a tuple'): # value, # } allow_multiline_dictionary_keys=False # Allow lambdas to be formatted on more than one line. allow_multiline_lambdas=False # Allow splits before the dictionary value. allow_split_before_dict_value=True # Number of blank lines surrounding top-level function and class # definitions. blank_lines_around_top_level_definition=2 # Insert a blank line before a class-level docstring. blank_line_before_class_docstring=False # Insert a blank line before a module docstring. blank_line_before_module_docstring=False # Insert a blank line before a 'def' or 'class' immediately nested # within another 'def' or 'class'. For example: # # class Foo: # # <------ this blank line # def method(): # ... 
blank_line_before_nested_class_or_def=False # Do not split consecutive brackets. Only relevant when # dedent_closing_brackets is set. For example: # # call_func_that_takes_a_dict( # { # 'key1': 'value1', # 'key2': 'value2', # } # ) # # would reformat to: # # call_func_that_takes_a_dict({ # 'key1': 'value1', # 'key2': 'value2', # }) coalesce_brackets=False # The column limit. column_limit=79 # The style for continuation alignment. Possible values are: # # - SPACE: Use spaces for continuation alignment. This is default behavior. # - FIXED: Use fixed number (CONTINUATION_INDENT_WIDTH) of columns # (ie: CONTINUATION_INDENT_WIDTH/INDENT_WIDTH tabs) for continuation # alignment. # - LESS: Slightly left if cannot vertically align continuation lines with # indent characters. # - VALIGN-RIGHT: Vertically align continuation lines with indent # characters. Slightly right (one more indent character) if cannot # vertically align continuation lines with indent characters. # # For options FIXED, and VALIGN-RIGHT are only available when USE_TABS is # enabled. continuation_align_style=SPACE # Indent width used for line continuations. continuation_indent_width=4 # Put closing brackets on a separate line, dedented, if the bracketed # expression can't fit in a single line. Applies to all kinds of brackets, # including function definitions and calls. For example: # # config = { # 'key1': 'value1', # 'key2': 'value2', # } # <--- this bracket is dedented and on a separate line # # time_series = self.remote_client.query_entity_counters( # entity='dev3246.region1', # key='dns.query_latency_tcp', # transform=Transformation.AVERAGE(window=timedelta(seconds=60)), # start_ts=now()-timedelta(days=3), # end_ts=now(), # ) # <--- this bracket is dedented and on a separate line dedent_closing_brackets=False # Disable the heuristic which places each list element on a separate line # if the list is comma-terminated. disable_ending_comma_heuristic=False # Place each dictionary entry onto its own line. 
each_dict_entry_on_separate_line=True # The regex for an i18n comment. The presence of this comment stops # reformatting of that line, because the comments are required to be # next to the string they translate. i18n_comment= # The i18n function call names. The presence of this function stops # reformatting on that line, because the string it has cannot be moved # away from the i18n comment. i18n_function_call= # Indent the dictionary value if it cannot fit on the same line as the # dictionary key. For example: # # config = { # 'key1': # 'value1', # 'key2': value1 + # value2, # } indent_dictionary_value=False # The number of columns to use for indentation. indent_width=4 # Join short lines into one line. E.g., single line 'if' statements. join_multiple_lines=True # Do not include spaces around selected binary operators. For example: # # 1 + 2 * 3 - 4 / 5 # # will be formatted as follows when configured with *,/: # # 1 + 2*3 - 4/5 # no_spaces_around_selected_binary_operators=set([]) # Use spaces around default or named assigns. spaces_around_default_or_named_assign=False # Use spaces around the power operator. spaces_around_power_operator=False # The number of spaces required before a trailing comment. spaces_before_comment=2 # Insert a space between the ending comma and closing bracket of a list, # etc. space_between_ending_comma_and_closing_bracket=True # Split before arguments split_all_comma_separated_values=False # Split before arguments if the argument list is terminated by a # comma. split_arguments_when_comma_terminated=False # Set to True to prefer splitting before '&', '|' or '^' rather than # after. split_before_bitwise_operator=True # Split before the closing bracket if a list or dict literal doesn't fit on # a single line. split_before_closing_bracket=True # Split before a dictionary or set generator (comp_for). For example, note # the split before the 'for': # # foo = { # variable: 'Hello world, have a nice day!' 
# for variable in bar if variable != 42 # } split_before_dict_set_generator=True # Split after the opening paren which surrounds an expression if it doesn't # fit on a single line. split_before_expression_after_opening_paren=False # If an argument / parameter list is going to be split, then split before # the first argument. split_before_first_argument=False # Set to True to prefer splitting before 'and' or 'or' rather than # after. split_before_logical_operator=True # Split named assignments onto individual lines. split_before_named_assigns=True # Set to True to split list comprehensions and generators that have # non-trivial expressions and multiple clauses before each of these # clauses. For example: # # result = [ # a_long_var + 100 for a_long_var in xrange(1000) # if a_long_var % 10] # # would reformat to something like: # # result = [ # a_long_var + 100 # for a_long_var in xrange(1000) # if a_long_var % 10] split_complex_comprehension=False # The penalty for splitting right after the opening bracket. split_penalty_after_opening_bracket=30 # The penalty for splitting the line after a unary operator. split_penalty_after_unary_operator=10000 # The penalty for splitting right before an if expression. split_penalty_before_if_expr=0 # The penalty of splitting the line around the '&', '|', and '^' # operators. split_penalty_bitwise_operator=300 # The penalty for splitting a list comprehension or generator # expression. split_penalty_comprehension=80 # The penalty for characters over the column limit. split_penalty_excess_character=4500 # The penalty incurred by adding a line split to the unwrapped line. The # more line splits added the higher the penalty. split_penalty_for_added_line_split=30 # The penalty of splitting a list of "import as" names. 
For example: # # from a_very_long_or_indented_module_name_yada_yad import (long_argument_1, # long_argument_2, # long_argument_3) # # would reformat to something like: # # from a_very_long_or_indented_module_name_yada_yad import ( # long_argument_1, long_argument_2, long_argument_3) split_penalty_import_names=0 # The penalty of splitting the line around the 'and' and 'or' # operators. split_penalty_logical_operator=300 # Use the Tab character for indentation. use_tabs=False ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(poptorch) include(GNUInstallDirs) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "Default to local install prefix" FORCE) endif() set(USE_PYTORCH_PACKAGE_HEADERS ON CACHE BOOL "Use the Torch headers distributed with the pytorch package.") set(POPLAR_DIR CACHE PATH "Path to a Poplar install") set(POPART_DIR CACHE PATH "Path to a Popart install") set(SNAPSHOT "" CACHE STRING "Snapshot ID to use for the documentation") set(SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)") set(BUILD_DOCS OFF CACHE BOOL "Build PopTorch's documentation") set(COPY_TESTS OFF CACHE BOOL "Copy tests files to the build folder (instead of running them from the sources folder)") set(ENABLE_WERROR ON CACHE BOOL "Treat C++ warnings as errors") set(EXTRA_PYTEST_ARGS "" CACHE STRING "Extra arguments to pass to pytest when generating the list of tests to run") # Always use the gold linker to avoid segfaults with PopART / Poplar on some OSes. 
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") find_program(GOLD_EXECUTABLE ld.gold REQUIRED) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -fuse-ld=gold") endif() # Note: The next line is also parsed by scripts/utils/_utils.py set(TORCH_VERSION 2.0.1) # Convert to cmake list string(REPLACE "." ";" TORCH_VERSION_AS_LIST ${TORCH_VERSION}) # Get the minor component. (Versions are Major.Minor.Patch) list(GET TORCH_VERSION_AS_LIST 1 TORCH_MINOR_VERSION) set(CMAKE_CONFIGURATION_TYPES "Release" "Debug" "MinSizeRel" "RelWithDebInfo") set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES}) if(NOT CMAKE_BUILD_TYPE) list(GET CMAKE_CONFIGURATION_TYPES 0 CMAKE_BUILD_TYPE) message(STATUS "Setting build type to '${CMAKE_BUILD_TYPE}' as none was specified") endif() if(NOT CMAKE_BUILD_TYPE IN_LIST CMAKE_CONFIGURATION_TYPES) message(FATAL_ERROR "CMAKE_BUILD_TYPE must be one of ${CMAKE_CONFIGURATION_TYPES}") endif() if(USE_PYTORCH_PACKAGE_HEADERS) execute_process(COMMAND python3 -c "import torch; from pathlib import Path; print(Path(torch.__file__).parent, end='')" OUTPUT_VARIABLE TORCH_PATH) list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) if(NOT TORCH_PATH) message(FATAL_ERROR "python3 -c \"import torch\" failed: check that your virtual environment is active and torch is installed") endif() execute_process(COMMAND python3 -c "import torch; import sys; sys.exit(3 if torch.version.debug else 4)" RESULT_VARIABLE TORCH_DEBUG) if(TORCH_DEBUG LESS 3 OR TORCH_DEBUG GREATER 4) message(FATAL_ERROR "python3 -c \"import torch\" failed: check that your virtual environment is active and torch is installed") endif() if (TORCH_DEBUG EQUAL 4) # We include torch header files which respond to this flag, so we have to # set it correctly in order to get consistent torch behaviour. 
add_compile_definitions(NDEBUG) endif() endif() if(EXISTS ${SDK_DIR}) execute_process(COMMAND find ${SDK_DIR} -maxdepth 1 -type d -name "popart*" OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND find ${SDK_DIR} -maxdepth 1 -type d -name "poplar-*" -o -name "poplar" OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT IS_DIRECTORY "${POPLAR_DIR}") message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${SDK_DIR}'") endif() if(NOT IS_DIRECTORY "${POPART_DIR}") message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${SDK_DIR}'") endif() endif() if(EXISTS ${POPLAR_DIR}) list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh") else() # Check the package is not already in the path find_package(poplar) if(NOT poplar_FOUND) message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") endif() endif() if( EXISTS ${POPART_DIR} ) list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh") else() find_package(popart COMPONENTS popart-only) if(NOT popart_FOUND) message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") endif() endif() if(NOT popart_FOUND) find_package(popart REQUIRED COMPONENTS popart-only) endif() if(NOT poplar_FOUND) find_package(poplar REQUIRED) endif() get_target_property(POPLAR_LIB poplar LOCATION) get_filename_component(POPLAR_DIR ${POPLAR_LIB} DIRECTORY) # Run an install command that requires PopTorch, PopArt and Poplar to be in the PATH. 
function(run_poptorch_install_command cmd working_directory cmd_name) install(CODE "set(ENV{LD_LIBRARY_PATH} ${popart_LIB_DIR}:${POPLAR_DIR}:$ENV{LD_LIBRARY_PATH}) set(ENV{POPTORCH_SMALL_IPU_MODEL} 1) execute_process( COMMAND ${cmd} WORKING_DIRECTORY ${working_directory} RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT) if(RETVAL AND NOT RETVAL EQUAL 0) message(FATAL_ERROR \"${cmd_name} FAILED: \${OUTPUT}\") endif()") endfunction() function(remove_use_distributed_definition target) get_target_property(compile_options ${target} INTERFACE_COMPILE_DEFINITIONS) if(NOT compile_options) message(FATAL_ERROR "Could not get property INTERFACE_COMPILE_DEFINITIONS from target '${target}'.") endif() list(REMOVE_ITEM compile_options USE_DISTRIBUTED) list(REMOVE_ITEM compile_options USE_RPC) set_target_properties(${target} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${compile_options}") endfunction() file(WRITE ${CMAKE_BINARY_DIR}/tmp/test.sh "#!$ENV{SHELL} source ${CMAKE_BINARY_DIR}/enable.sh export POPTORCH_TEST_TIMEOUT=3600 export TORCH_SHOW_CPP_STACKTRACES=1 ctest --output-on-failure --timeout $POPTORCH_TEST_TIMEOUT $@ ") file(COPY ${CMAKE_BINARY_DIR}/tmp/test.sh DESTINATION ${CMAKE_BINARY_DIR} FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) enable_testing() add_compile_options( -Wall -pedantic -Wextra -Wdisabled-optimization -Wshadow -Wswitch -Wformat=2 -Wimplicit-fallthrough -Winit-self -Wcomment -Wsequence-point -Wundef -Wuninitialized -DTORCH_MINOR_VERSION=${TORCH_MINOR_VERSION}) if(${ENABLE_WERROR}) add_compile_options(-Werror) endif() set(CMAKE_CXX_STANDARD 17) set(INSTALL_PYDIR ${CMAKE_INSTALL_PREFIX}/poptorch) if(${CMAKE_SYSTEM_NAME} STREQUAL Darwin) set(CMAKE_INSTALL_RPATH "@loader_path") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) else() # $ORIGIN/lib is needed by the standalone wheel: by default libraries expect # their dependencies to be in $ORIGIN/../lib therefore for Poplar # and Popart 
to work they need to be stored in a folder named "lib". # However the poptorch shared libraries which are loaded from python must be at # the root of the poptorch package, which is why we need to add $ORIGIN/lib # to the poptorch libraries. set(CMAKE_INSTALL_RPATH "$ORIGIN:$ORIGIN/lib") endif() find_package(Torch ${TORCH_VERSION} EXACT REQUIRED) remove_use_distributed_definition(torch_cpu) add_subdirectory(poptorch_err) add_subdirectory(poptorch_logging) add_subdirectory(poptorch_compiler/pytorch_bridge) add_subdirectory(poptorch/source/dispatch_tracer) add_subdirectory(popart_compiler) add_subdirectory(poptorch) add_subdirectory(python) add_subdirectory(poptorch_geometric) # Examples and tests add_subdirectory(tests) add_subdirectory(examples) add_subdirectory(docs/user_guide) add_custom_target(poptorch_standalone_wheel WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX} COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py bdist_wheel --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_PYDIR} --standalone "${popart_LIB_DIR}:${POPLAR_DIR}" DEPENDS poptorch ) add_custom_target(poptorch_standalone_sdist WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX} COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py sdist --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_PYDIR} --standalone "${popart_LIB_DIR}:${POPLAR_DIR}" DEPENDS poptorch ) add_custom_target(poptorch_wheel WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX} COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py bdist_wheel --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_PYDIR} DEPENDS poptorch ) add_custom_target(poptorch_sdist 
WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX} COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py sdist --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_PYDIR} DEPENDS poptorch ) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scripts/enable.sh.in ${PROJECT_BINARY_DIR}/enable.sh @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/enable.sh DESTINATION .) if(${CMAKE_VERSION} VERSION_GREATER "3.15.0") # Building poptorch without installing it doesn't make sense: the python # module cannot be used so always install after a build. add_custom_target(install_poptorch ALL COMMAND ${CMAKE_COMMAND} --install ${CMAKE_BINARY_DIR} DEPENDS poptorch custom_cube_op custom_leaky_relu_op custom_add_scalar_op custom_add_scalar_vec_op custom_add_vec_scalar_mul_op custom_reduce_op custom_three_input_reduce_op custom_many_attribute_op ) endif() ================================================ FILE: License.txt ================================================ The MIT License (MIT) Copyright (c) 2020 Graphcore Limited Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ recursive-include include *.hpp include poptorch/lib/* include poptorch/lib/poplar_rt/* include poptorch/lib/graphcore/lib/*.a include *.py include *.toml include License.txt include poptorch_third_party_licenses.txt ================================================ FILE: README.md ================================================ # PopTorch and PopTorch Geometric. ## PopTorch - PyTorch integration for the Graphcore IPU PopTorch is a set of extensions for PyTorch enabling models to be trained, evaluated and used on the Graphcore IPU. More information can be found in the [PopTorch User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/). ## PopTorch Geometric - PyTorch Geometric integration for the Graphcore IPU PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling Graph Neural Network models to be trained, evaluated and used on the Graphcore IPU. PopTorch Geometric depends on the functionality provided by PopTorch. More information can be found in the [PopTorch Geometric User Guide](https://docs.graphcore.ai/projects/poptorch-geometric-user-guide). ## Prerequisites These instructions assume you are building PopTorch and PopTorch Geometric on Ubuntu 20.04. 
To install and run PopTorch and PopTorch Geometric you will need: - Python 3.8 - pip3 >= 18.1 - The Poplar SDK ```sh sudo apt install -y python3 python3-pip ``` To build PopTorch and PopTorch Geometric from sources you will need all of the above and: - git - curl - g++ ```sh sudo apt install -y git curl g++ ``` To build the documentation you will also need LaTeX: ```sh sudo apt install -y texlive-full ``` ## Install the Poplar SDK The Poplar SDK can be downloaded from: https://www.graphcore.ai/downloads. Set the following environment variable to point to the installed Poplar SDK: ```sh export SDK_PATH=/path/to/poplar_sdk-ubuntu_20_04* ``` PopTorch must be built against a compatible version of the SDK. For example, the "sdk-release-3.2" branch of PopTorch must be built against Poplar SDK 3.2. ## Installation Make sure `pip3` is up to date (you need `pip3 >= 18.1`): ```sh pip3 install -U pip --user ``` Install the PopTorch wheel (Torch will automatically be installed in the process): ```sh pip3 install ${SDK_PATH}/poptorch-*.whl ``` Once the PopTorch wheel has been installed, the PopTorch Geometric wheel can be installed if needed (PyTorch Geometric will automatically be installed in the process): ```sh pip3 install ${SDK_PATH}/poptorch_geometric-*.whl ``` ## Usage The PopTorch wheel doesn't include the PopART and Poplar binaries, so you need to make sure they are in your path before loading PopTorch or PopTorch Geometric. This is done by sourcing their respective `enable.sh` scripts: ```sh . ${SDK_PATH}/poplar-ubuntu_20_04*/enable.sh . 
${SDK_PATH}/popart-ubuntu_20_04*/enable.sh ``` You can check everything is in order by running: ```sh python3 -c "import poptorch;print(poptorch.__version__)" ``` And similarly for PopTorch Geometric: ```sh python3 -c "import poptorch_geometric;print(poptorch_geometric.__version__)" ``` More information can be found in the [PopTorch User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/). ## Build instructions We use [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge) as our build environment manager. 1. Clone the PopTorch repository ```sh git clone https://github.com/graphcore/poptorch.git ``` 2. Create a folder for your build ```sh mkdir build cd build ``` 3. Create a build environment and install the dependencies. ```sh ../poptorch/scripts/create_buildenv.py ``` 4. Activate the build environment ```sh . activate_buildenv.sh ``` 5. Configure the build ```sh cmake ../poptorch -DSDK_DIR=${SDK_PATH} -GNinja ``` By default, PopTorch will be built in release mode. To build in debug mode add `-DCMAKE_BUILD_TYPE=Debug`. To build the documentation, add `-DBUILD_DOCS=ON`. The HTML and PDF documentation will be generated in `docs/`. 6. Compile the PopTorch and PopTorch Geometric libraries ```sh ninja install ``` If you're only going to use PopTorch or PopTorch Geometric for development purposes then you can stop here. Source the enable script in the PopTorch build folder and you can start using PopTorch: ```sh . enable.sh python3 -c "import poptorch;print(poptorch.__version__)" ``` Similarly for PopTorch Geometric: ```sh . enable.sh python3 -c "import poptorch_geometric;print(poptorch_geometric.__version__)" ``` 7. (Optional) Build the PopTorch wheel ```sh ninja poptorch_wheel ``` The wheel will be created in `install/dist`. 8. (Optional) Build the PopTorch Geometric wheel ```sh ninja poptorch_geometric_wheel ``` The wheel will be created in `install/dist`. 
### Run the tests To run the tests: ```sh # Run all the tests, print the output only on failure, run 80 tests in parallel ./test.sh -j80 # PopTorch has 3 test labels: examples, short, long. To run all the tests except the long ones: ./test.sh -j80 -LE long # To run only the short tests ./test.sh -j80 -L short # Filter the tests by name using -R ./test.sh -j80 -R half_ # For more information: ./test.sh --help ``` Note: If you run the tests in parallel, make sure to tell PopTorch to wait for an IPU to become available if they are all in use: ```sh export POPTORCH_WAIT_FOR_IPU=1 ``` Tests can also be run individually using `pytest`: ```sh . enable.sh python3 -m pytest ../poptorch/tests/options_test.py # add -s to get the whole output # -k to filter the tests by name python3 -m pytest ../poptorch/tests/options_test.py -s -k popart ``` Tests specific for Graph Neural Networks are located in `tests/gnn/` subdirectory: ```sh . enable.sh python3 -m pytest ../poptorch/tests/gnn/test_basic_gnn.py # add -s to get the whole output # -k to filter the tests by name python3 -m pytest ../poptorch/tests/gnn/test_basic_gnn.py -s -k GraphSAGE ``` ## Feedback / issues Please create issues [here](https://github.com/graphcore/poptorch/issues) ================================================ FILE: config.buildenv.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. _llvm_version = "13.0.1" config.setDefault(build_documentation=True) installers.add(PipRequirements("requirements.txt")) installers.add(PipRequirements("poptorch_geometric/requirements.txt")) installers.add( CondaPackages( "boost-cpp=1.72.0", "ccache=4.3", "cmake=3.18.2", "libstdcxx-ng=11.2.0", "make=4.3", "ninja=1.10.2", "pybind11=2.6.1", "pyyaml=5.3.1", # Mamba overwrites that package, so it must explicitly # appear here with the correct version. 
"setuptools=58.0.4", "spdlog=1.8.0", # Mamba overwrites that package, so it must explicitly # appear here with the correct version. "typing-extensions=4.1.1", # Mamba overwrites that package, so it must explicitly # appear here with the correct version. "wheel=0.34.2", "zip=3.0")) if config.build_documentation: installers.add( CondaPackages( "breathe=4.25.1", "docutils==0.16", "hunspell=1.7.0", # Indirect dependency of sphinx which # doesn't get automatically installed. "jinja2=3.0.3", "latexmk=4.55", "sphinx=3.3.1", "sphinx_rtd_theme=0.5.0", )) if config.install_linters: installers.add( CondaPackages( "pre-commit=3.3.3", "clang-tools=" + _llvm_version, "pylint=2.7.2", "yapf=0.27.0", # To preserve the comments when updating the schemas "ruamel.yaml=0.17.21", )) class DownloadExternalDatasets(Installer): def __init__(self, **kwargs): super().__init__(**kwargs) self.downloader_path = os.path.join(_utils.sources_dir(), 'scripts', 'download_external_datasets.py') if not os.path.exists(self.downloader_path): raise RuntimeError(f'Path {self.downloader_path} not exists.') def hashString(self): with open(self.downloader_path, "r") as f: return f.read() def install(self, env): datasets_path = os.path.join(env.prefix, "external_datasets") env.run_commands(f"mkdir {datasets_path}", f"python3 {self.downloader_path} {datasets_path}") installers.add(DownloadExternalDatasets()) ================================================ FILE: docs/common/_static/css/custom_rtd.css ================================================ /* Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
Override the sphinx-readthedocs-theme settings */ /* improve table layout, allowing cells to wrap */ .rst-content table.docutils col { width: auto; } .wy-table-responsive table td, .wy-table-responsive table th, .rst-content .wy-table-responsive table td, .rst-content .wy-table-responsive table th { white-space: normal; word-wrap: normal; border: solid 2px #e1e4e5; } th.head p { margin-bottom: 0; } /* make inline code/literal text less ugly */ /* .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { color: #565656; font-size: 90%; padding: 0; background: #ffffff; border: none; } */ /* make image captions a sensible size and format */ .rst-content .toctree-wrapper p.caption { font-size: 14px; font-weight: 700; font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif; text-align: center; margin-top: 14px; } /* change background colour for code samples */ div.highlight, .rst-content pre.literal-block, .rst-content pre.literal-block div[class^='highlight'], .rst-content div[class^='highlight'], .rst-content div[class^='highlight'] pre, .rst-content div[class^='highlight'] div[class^='highlight'], .rst-content div[class^='highlight'] td.code { background-color: white ; color: #292c31; } /* remove ugly top border from definition lists */ .rst-content dl:not(.docutils) dt { border-top: none; } /* Style nav menu in Graphcore colours and fonts */ .wy-menu-vertical li.toctree-l1 span.toctree-expand, .wy-menu-vertical li span.toctree-expand, .wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li a.current span.toctree-expand { color: white; visibility: visible; display: inline-block; mix-blend-mode: difference; } .fa-home:before, .icon-home:before { display: none; } a.icon, a.icon:visited, a.icon:hover, a.icon-home, a.icon-home:hover, a.icon-home:visited { color: white; font-weight: bold; } .btn { font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif; } input[type="button"], input[type="reset"], 
input[type="submit"], input[type="text"], input[type="password"], input[type="email"], input[type="url"], input[type="date"], input[type="month"], input[type="time"], input[type="datetime"], input[type="datetime-local"], input[type="week"], input[type="number"], input[type="search"], input[type="tel"], input[type="color"] { font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif; } textarea { font-family: "Graphik", "Lato", "proxima-nova", "Helvetica Neue", Arial, sans-serif; } a:link { color: #ff6f79; } a:visited { color: #ff6f79; } a:hover { color: #fbc3aa; } body { font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif; color: #565656; } h1, h2, h3, h4, h5, h6, legend { font-family: "Graphik", "Lato", "Helvetica Neue", Arial, sans-serif; color: #292c31; } h1 { text-transform: uppercase; font-family: "GC Quantized", "Roboto Slab", "ff-tisa-web-pro", "Georgia", Arial, sans-serif; } .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal, .rst-content pre.literal-block, .rst-content .linenodiv pre, .rst-content div[class^='highlight'] pre, code, .rst-content tt, .rst-content code, .rst-content pre, .rst-content kbd, .rst-content samp, footer span.commit code, footer span.commit .rst-content tt, .rst-content footer span.commit tt { font-family: Consolas, "Andale Mono WT", "Andale Mono", "Lucida Console", "Lucida Sans Typewriter", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", "Liberation Mono", "Nimbus Mono L", Monaco, "Courier New", Courier, monospace; color: #292c31; } .wy-menu-vertical li.toctree-l1 >a, .wy-menu-vertical li.toctree-l1 >a:visited { color: #e9e9e9 !important; } .wy-menu-vertical li.toctree-l1.current a, .wy-menu-vertical li a { color: #292c31 !important; } .wy-menu-vertical a:hover { background: #ff6f79 !important; color: #fff !important; } .wy-menu, .wy-menu-vertical, .wy-nav-side, .wy-side-nav-search { background-color: #292c31; } .wy-side-nav-search>div.version { color: #fff; 
text-align: left; padding: 0 .75em; } .rst-content .sidebar .sidebar-title { font-family: "GC Quantized", "Roboto Slab", "ff-tisa-web-pro", "Georgia", Arial, sans-serif; } .rst-content .footnote-reference, .rst-content .citation-reference { vertical-align: super; } .rst-content table.docutils.citation, .rst-content table.docutils.footnote { color: #999; } .wy-breadcrumbs { display: none; } footer p { color: #565656; font-size: 15px; } ================================================ FILE: docs/common/conf.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. # Configuration file for the Sphinx documentation builder. # # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import pathlib import json import sys import datetime sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = 'Project' author = 'Graphcore Ltd' # The full version, including alpha/beta/rc tags # Looks like html uses 'version' and latex uses 'release' version = 'v0.0.0' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [
    'sphinx.ext.graphviz',
    'sphinx.ext.autodoc',
    'sphinx.ext.extlinks',
]


def get_current_release() -> str:
    """Return the SDK release string formatted as "<major>.<minor>".

    The values are read from the "major" and "minor" keys of the
    ``version.json`` file located in the third-level parent directory of
    this configuration file (``pathlib.Path(__file__).parents[2]``).
    """
    format_str = "{major}.{minor}"
    version_file = pathlib.Path(__file__).parents[2].resolve() / "version.json"
    # Open through a context manager so the file handle is closed as soon as
    # the JSON has been parsed, instead of being left to the garbage
    # collector (which triggers a ResourceWarning).
    with open(version_file, encoding="utf-8") as version_fp:
        return format_str.format(**json.load(version_fp))


# Release identifier substituted into the documentation links below.
SDK_RELEASE = get_current_release()

# Short link roles: each entry maps a role name to a URL template whose
# "%s" is replaced by the role's argument.
extlinks = {
    'tutorials-repo':
    (f'https://github.com/graphcore/tutorials/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
    'github-poptorch':
    (f'https://github.com/graphcore/poptorch/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = []

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

nitpick_ignore = [
    ('py:class', 'bool'),
    ('py:class', 'dict'),
    ('py:class', 'int'),
    ('py:class', 'iterable'),
    ('py:class', 'optional'),
    ('py:class', 'str'),
    ('py:class', 'T_co'),
    ('py:class', 'datetime.timedelta'),
    ('py:class', 'torch.Tensor'),
    ('py:class', 'torch.dtype'),
    ('py:class', 'torch.nn.Module'),
    ('py:class', 'torch.optim.Optimizer'),
    ('py:class', 'torch.optim.optimizer.Optimizer'),
    ('py:class', 'torch.utils.data.Dataset'),
    ('py:class', 'torch.utils.data.sampler.Sampler'),
    # Enums already described in functions that use them
    ('py:class', 'poptorch.OutputMode'),
    ('py:class', 'poptorch.ConnectionType'),
    ('py:class', 'poptorch.HalfFloatCastingBehavior'),
    ('py:class', 'poptorch.MatMulSerializationMode'),
    ('py:class', 'poptorch.OverlapMode'),
    ('py:class', 'poptorch.ReductionType'),
    ('py:class', 'poptorch.SyncPattern'),
    ('py:class', 'poptorch.MeanReductionStrategy'),
    # Type hints
    ('py:data', 'typing.Optional'),
    ('py:data', 'typing.Callable'),
    ('py:class', 'typing.ForwardRef'),
]

# Define abbreviations for IPU-PODn names with subscripts
# These use non-breaking hyphens & spaces, so be careful if editing or adding new definitions
pod_sizes = [2**i for i in range(2, 14)] + ["N"] pod_definitions = [ f".. |POD{i}| replace:: IPU‑POD\\ :subscript:`{i}`" for i in pod_sizes ] bow_definitions = [ f".. |BOW{i}| replace:: Bow Pod\\ :subscript:`{i}`" for i in pod_sizes ] rst_epilog = ("\n".join(pod_definitions) + "\n" + "\n".join(bow_definitions) + r""" .. role:: raw-html(raw) :format: html .. |POD| replace:: IPU‑POD .. |BOW| replace:: Bow Pod .. |newpage| raw:: latex \newpage .. |LEGAL:EULA| replace:: This software is made available under the terms of the `Graphcore End User License Agreement (EULA) `__ and the `Graphcore Container License Agreement `__. Please ensure you have read and accept the terms of the corresponding license before using the software. The Graphcore EULA applies unless indicated otherwise. .. |LEGAL:TRADEMARkS| replace:: Graphcloud®, Graphcore®, Poplar® and PopVision® are registered trademarks of Graphcore Ltd. :raw-html:`

` Bow™, Bow-2000™, Bow Pod™, Colossus™, In-Processor-Memory™, IPU-Core™, IPU-Exchange™, IPU-Fabric™, IPU-Link™, IPU-M2000™, IPU-Machine™, IPU-POD™, IPU-Tile™, PopART™, PopDist™, PopLibs™, PopRun™, PopTorch™, Streaming Memory™ and Virtual-IPU™ are trademarks of Graphcore Ltd. :raw-html:`

` All other trademarks are the property of their respective owners. """ + f".. |YEAR| replace:: {datetime.date.today().year}" + "\n" + f".. |SDK_RELEASE| replace:: {SDK_RELEASE}" "\n") # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_options = {'logo_only': False, 'navigation_depth': 4} numfig = True numfig_format = { 'section': 'Section {number}, {name}', 'figure': 'Fig. %s', 'table': 'Table %s', 'code-block': 'Listing %s' } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # CSS file to create the Graphcore style html_css_files = [ 'css/custom_rtd.css', ] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". html_title = 'Document Title' # The name of an image file (relative to this directory) to place at the top # of the sidebar. html_logo = 'graphcorelogo-html.png' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. html_use_smartypants = True # If true, links to the reST sources are added to the pages. html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. html_show_sphinx = False # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. html_show_copyright = False # -- Options for LaTeX output --------------------------------------------- # Don't know how much of this is necessary. It's a bit of a mess. # pifont required for tick and cross characters # use array stretch to get taller table rows. 
Also consider sing extrarowheight # \\setlength{\\extrarowheight}{1pt} ADDITIONAL_PREAMBLE = r""" \setcounter{secnumdepth}{5} \setcounter{tocdepth}{5} \usepackage{threeparttable} \usepackage{pifont} \usepackage{array} \usepackage{charter} \usepackage[defaultsans]{lato} \usepackage{inconsolata} \usepackage{listings} \usepackage{verbatim} \usepackage{multicol} \usepackage{float} \usepackage{fancyhdr} %% Obtain access to ssmall font size \usepackage[10pt]{moresize} %% adjustbox used to set max width and height for images \usepackage[export]{adjustbox} %% Define a right-justified table column type \usepackage{ragged2e} \newcolumntype{R}[1]{>{\RaggedLeft\arraybackslash}p{#1}} \renewcommand{\arraystretch}{1.4} \usepackage{booktabs} \usepackage{graphicx} %% Push footnotes to the bottom of the page \usepackage[bottom]{footmisc} \usepackage{pdfpages} \usepackage{pdflscape} \usepackage{transparent} \usepackage[normalem]{ulem} %% Watermark stuff \usepackage{draftwatermark} \SetWatermarkFontSize{2cm} \SetWatermarkColor[gray]{0.96} \SetWatermarkText{} \SetWatermarkScale{2} \SetWatermarkAngle{30} %% Ensure API descriptions are all tt family \let\fulllineitemsOld\fulllineitems \let\endfulllineitemsOld\endfulllineitems \renewenvironment{fulllineitems}{\ttfamily\small\fulllineitemsOld}{\endfulllineitemsOld} %% Change the Sphinx verbatim to not put the box around it and to indent \renewcommand{\Verbatim}[1][1]{% % list starts new par, but we don't want it to be set apart vertically \bgroup\parskip=0pt% \medskip % The list environment is needed to control perfectly the vertical % space. 
\list{}{% \setlength\parskip{0pt}% \setlength\itemsep{0ex}% \setlength\topsep{0ex}% \setlength\partopsep{0pt}% \setlength\leftmargin{0pt}% }% \OriginalVerbatim[#1,xleftmargin=0.5cm,formatcom=\normalsize]% } \renewcommand{\endVerbatim}{% \endOriginalVerbatim% \endlist% % close group to restore \parskip \egroup% } \newcommand{\VerbBorders}{% \renewcommand{\Verbatim}[1][1]{% % list starts new par, but we don't want it to be set apart vertically \bgroup\parskip=0pt% \smallskip% % The list environment is needed to control perfectly the vertical % space. \list{}{% \setlength\parskip{0pt}% \setlength\itemsep{0ex}% \setlength\topsep{0ex}% \setlength\partopsep{0pt}% \setlength\leftmargin{0pt}% }% \item\MakeFramed {\FrameRestore}% \small% \OriginalVerbatim[##1]% } \renewcommand{\endVerbatim}{% \endOriginalVerbatim% \endMakeFramed% \endlist% % close group to restore \parskip \egroup% } \definecolor{VerbatimColor}{rgb}{0.95,0.95,0.95} \definecolor{VerbatimBorderColor}{rgb}{1.0,1.0,1.0} } \makeatletter \DeclareTextCommandDefault{\textleftarrow}{\mbox{$\m@th\leftarrow$}} \makeatother """ ADDITIONAL_PREAMBLE += r""" %% Redefine sphinxstylethead (used only for table headers) to bold font \usepackage{letltxmacro} \LetLtxMacro{\oldtextsf}{\sphinxstyletheadfamily} \renewcommand{\sphinxstyletheadfamily}[0]{\oldtextsf \bf } """ ADDITIONAL_PREAMBLE += r""" \makeatletter \fancypagestyle{normal}{ \fancyhf{} \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}} \fancyfoot[LE,LO]{%(footer)s} \renewcommand{\headrulewidth}{0.4pt} \renewcommand{\footrulewidth}{0.4pt} } \fancypagestyle{plain}{ \fancyhf{} \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}} \fancyfoot[LE,LO]{%(footer)s} \renewcommand{\headrulewidth}{0pt} \renewcommand{\footrulewidth}{0.4pt} } \makeatother """ % { 'footer': '' } # From Sphinx 1.5 onwards, there are certain macros which are used which became # deprecated (e.g. \code). These macros should be upgraded in the future so # that we can move away from using the old macro names. 
latex_keep_old_macro_names = True

latex_elements = {
    # Package options, passed before the packages are loaded
    'passoptionstopackages':
    r'\PassOptionsToPackage{dvipsnames, table, xcdraw}{xcolor}',
    # Page margins handed to geometry
    'sphinxsetup': 'hmargin={0.75in, 0.75in}, vmargin={0.75in, 0.75in}',
    # Paper size: 'letterpaper' or 'a4paper'
    'papersize': 'a4paper',
    # Single sided to save paper and improve display
    'extraclassoptions': 'openany,oneside',
    # Font size: '10pt', '11pt' or '12pt'
    'pointsize': '10pt',
    # Disable figure floating
    'figure_align': 'H',
    # Extra material for the LaTeX preamble
    'preamble': ADDITIONAL_PREAMBLE,
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_class = 'manual'

# The PDF title comes from the DOC_TITLE environment variable when it is set,
# otherwise a placeholder title is used.
latex_title = os.environ.get('DOC_TITLE', "Document title")

latex_documents = [
    (
        'index',
        'doc.tex',
        latex_title,
        author,
        latex_class,
    ),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
latex_logo = 'graphcorelogo-pdf.png'

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
latex_use_parts = False

# If true, show page references after internal links.
latex_show_pagerefs = False

# If true, show URL addresses after external links.
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
# latex_appendices = []

# If false, no module index is generated.
latex_domain_indices = False # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html autodoc_default_options = { 'undoc-members': True, } autodoc_inherit_docstrings = True autodoc_typehints = 'description' ================================================ FILE: docs/common/custom_dic ================================================ accessor AdamW AMSGrad AsyncRebatched autograd backend booleans bwd checkpointed checkpointing codepaths config connectionist const constness CTC dict EOF float16 float32 FP16 InputChannels ints IO ipu IPU IPUs iterable L2 libpvti matmul Mk1 Mk2 mpirun Nesterov num OpenMPI OutputChannels PopART PopART's PopDist PopLibs PopRun PopTorch precompile pvti PyTorch PyTorch's rebatch rebatched rebatching recomputation ReducingDim replan RMSprop RTS serializable SGD sharded sharding stdout str submodules TODO tracepoint tracepoints tracepointsints unrounded unroundedPopRunsubmodules bool ================================================ FILE: docs/poptorch_geometric/common/conf.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. # Configuration file for the Sphinx documentation builder. # # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
#
import os
import pathlib
import json
import sys
import datetime

sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'Project'
author = 'Graphcore Ltd'

# The full version, including alpha/beta/rc tags
# Looks like html uses 'version' and latex uses 'release'
version = 'v0.0.0'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.napoleon',
    'sphinx.ext.graphviz',
    'sphinx.ext.autodoc',
    'sphinx.ext.extlinks',
]


def get_current_release() -> str:
    """Return the SDK release string formatted as ``<major>.<minor>``.

    The values are read from ``version.json``, which is looked up three
    directory levels above the directory containing this file.

    Raises:
        OSError: if ``version.json`` cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
        KeyError: if the ``major`` or ``minor`` key is missing.
    """
    version_file = pathlib.Path(__file__).parents[3].resolve() / "version.json"
    # Use a context manager so the file handle is closed deterministically
    # rather than being left open until garbage collection.
    with open(version_file, encoding="utf-8") as version_fp:
        version_info = json.load(version_fp)
    return "{major}.{minor}".format(**version_info)


SDK_RELEASE = get_current_release()

# Short-hand roles for links into the release branch matching SDK_RELEASE.
extlinks = {
    'tutorials-repo':
    (f'https://github.com/graphcore/tutorials/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
    'github-poptorch':
    (f'https://github.com/graphcore/poptorch/tree/sdk-release-{SDK_RELEASE}/%s',
     None),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = []

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [] nitpick_ignore = [ ('py:obj', 'num_nodes * (num_nodes - 1)'), ('py:obj', '0.0'), ('py:obj', '1'), ('py:obj', '2'), ('py:obj', 'graphs_mask'), ('py:obj', 'nodes_mask'), ('py:obj', 'edges_mask'), ('py:obj', 'True'), ('py:obj', 'False'), ('py:obj', 'num_nodes'), ('py:obj', 'num_edges'), ('py:obj', 'num_graphs'), ('py:obj', 'None'), ('py:obj', 'data_source'), ('py:obj', 'follow_batch'), ('py:obj', 'exclude_keys'), ('py:obj', 'batch_sampler'), ('py:obj', 'shuffle'), ('py:obj', 'Data'), ('py:class', 'bool'), ('py:class', 'dict'), ('py:class', 'int'), ('py:class', 'iterable'), ('py:class', 'optional'), ('py:class', 'str'), ('py:class', 'T_co'), ('py:class', 'Dataset'), ('py:class', 'ClusterData'), ('py:class', 'Sampler'), ('py:class', '..'), ('py:class', 'torch.Tensor'), ('py:class', 'torch.utils.data.dataset.Dataset'), ('py:class', 'torch.utils.data.sampler.Sampler'), ('py:class', 'torch_geometric.data.Batch'), ('py:class', 'torch_geometric.data.Data'), ('py:class', 'torch_geometric.data.Dataset'), ('py:class', 'torch_geometric.data.dataset.Dataset'), ('py:class', 'torch_geometric.data.HeteroData'), ('py:class', 'torch_geometric.loader.cluster.ClusterData'), ('py:class', 'torch_geometric.loader.ClusterData'), ('py:class', 'poptorch.AsynchronousDataAccessor'), # Enums already described in functions that use them ('py:class', 'poptorch.DataLoader'), ('py:class', 'poptorch.Dataset'), ('py:class', 'poptorch.Options'), ('py:class', 'bool'), ('py:class', 'dict'), ('py:class', 'int'), ('py:class', 'iterable'), ('py:class', 'optional'), ('py:class', 'str'), ('py:class', 'T_co'), ('py:class', 'torch.Tensor'), ('py:class', 'torch.dtype'), ('py:class', 'torch.nn.Module'), ('py:class', 'torch.optim.Optimizer'), ('py:class', 'torch.optim.optimizer.Optimizer'), ('py:class', 'torch.utils.data.Dataset'), ('py:class', 'torch.utils.data.sampler.Sampler'), # Enums already described in functions that use them ('py:class', 'poptorch.OutputMode'), ('py:class', 
'poptorch.ConnectionType'), ('py:class', 'poptorch.HalfFloatCastingBehavior'), ('py:class', 'poptorch.MatMulSerializationMode'), ('py:class', 'poptorch.OverlapMode'), ('py:class', 'poptorch.ReductionType'), ('py:class', 'poptorch.SyncPattern'), ('py:class', 'poptorch.MeanReductionStrategy'), # Type hints ('py:data', 'typing.Optional'), ('py:data', 'typing.Callable'), ('py:class', 'typing.ForwardRef'), ] # Define abbreviations for IPU-PODn names with subscripts # These use non-breaking hyphens & spaces, so be careful if editing or adding new definitions pod_sizes = [2**i for i in range(2, 14)] + ["N"] pod_definitions = [ f".. |POD{i}| replace:: IPU‑POD\\ :subscript:`{i}`" for i in pod_sizes ] bow_definitions = [ f".. |BOW{i}| replace:: Bow Pod\\ :subscript:`{i}`" for i in pod_sizes ] rst_epilog = ("\n".join(pod_definitions) + "\n" + "\n".join(bow_definitions) + r""" .. role:: raw-html(raw) :format: html .. |POD| replace:: IPU‑POD .. |BOW| replace:: Bow Pod .. |newpage| raw:: latex \newpage .. |LEGAL:EULA| replace:: This software is made available under the terms of the `Graphcore End User License Agreement (EULA) `__ and the `Graphcore Container License Agreement `__. Please ensure you have read and accept the terms of the corresponding license before using the software. The Graphcore EULA applies unless indicated otherwise. .. |LEGAL:TRADEMARkS| replace:: Graphcloud®, Graphcore®, Poplar® and PopVision® are registered trademarks of Graphcore Ltd. :raw-html:`

` Bow™, Bow-2000™, Bow Pod™, Colossus™, In-Processor-Memory™, IPU-Core™, IPU-Exchange™, IPU-Fabric™, IPU-Link™, IPU-M2000™, IPU-Machine™, IPU-POD™, IPU-Tile™, PopART™, PopDist™, PopLibs™, PopRun™, PopTorch™, Streaming Memory™ and Virtual-IPU™ are trademarks of Graphcore Ltd. :raw-html:`

` All other trademarks are the property of their respective owners. """ + f".. |YEAR| replace:: {datetime.date.today().year}" + "\n" + f".. |SDK_RELEASE| replace:: {SDK_RELEASE}" "\n") # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_options = {'logo_only': False, 'navigation_depth': 4} numfig = True numfig_format = { 'section': 'Section {number}, {name}', 'figure': 'Fig. %s', 'table': 'Table %s', 'code-block': 'Listing %s' } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['../../common/_static'] # CSS file to create the Graphcore style html_css_files = [ 'css/custom_rtd.css', ] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". html_title = 'Document Title' # The name of an image file (relative to this directory) to place at the top # of the sidebar. html_logo = '../../common/graphcorelogo-html.png' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. html_use_smartypants = True # If true, links to the reST sources are added to the pages. html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. html_show_sphinx = False # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. html_show_copyright = False # -- Options for LaTeX output --------------------------------------------- # Don't know how much of this is necessary. It's a bit of a mess. # pifont required for tick and cross characters # use array stretch to get taller table rows. 
Also consider sing extrarowheight # \\setlength{\\extrarowheight}{1pt} ADDITIONAL_PREAMBLE = r""" \setcounter{secnumdepth}{5} \setcounter{tocdepth}{5} \usepackage{threeparttable} \usepackage{pifont} \usepackage{array} \usepackage{charter} \usepackage[defaultsans]{lato} \usepackage{inconsolata} \usepackage{listings} \usepackage{verbatim} \usepackage{multicol} \usepackage{float} \usepackage{fancyhdr} %% Obtain access to ssmall font size \usepackage[10pt]{moresize} %% adjustbox used to set max width and height for images \usepackage[export]{adjustbox} %% Define a right-justified table column type \usepackage{ragged2e} \newcolumntype{R}[1]{>{\RaggedLeft\arraybackslash}p{#1}} \renewcommand{\arraystretch}{1.4} \usepackage{booktabs} \usepackage{graphicx} %% Push footnotes to the bottom of the page \usepackage[bottom]{footmisc} \usepackage{pdfpages} \usepackage{pdflscape} \usepackage{transparent} \usepackage[normalem]{ulem} %% Watermark stuff \usepackage{draftwatermark} \SetWatermarkFontSize{2cm} \SetWatermarkColor[gray]{0.96} \SetWatermarkText{} \SetWatermarkScale{2} \SetWatermarkAngle{30} %% Ensure API descriptions are all tt family \let\fulllineitemsOld\fulllineitems \let\endfulllineitemsOld\endfulllineitems \renewenvironment{fulllineitems}{\ttfamily\small\fulllineitemsOld}{\endfulllineitemsOld} %% Change the Sphinx verbatim to not put the box around it and to indent \renewcommand{\Verbatim}[1][1]{% % list starts new par, but we don't want it to be set apart vertically \bgroup\parskip=0pt% \medskip % The list environment is needed to control perfectly the vertical % space. 
\list{}{% \setlength\parskip{0pt}% \setlength\itemsep{0ex}% \setlength\topsep{0ex}% \setlength\partopsep{0pt}% \setlength\leftmargin{0pt}% }% \OriginalVerbatim[#1,xleftmargin=0.5cm,formatcom=\normalsize]% } \renewcommand{\endVerbatim}{% \endOriginalVerbatim% \endlist% % close group to restore \parskip \egroup% } \newcommand{\VerbBorders}{% \renewcommand{\Verbatim}[1][1]{% % list starts new par, but we don't want it to be set apart vertically \bgroup\parskip=0pt% \smallskip% % The list environment is needed to control perfectly the vertical % space. \list{}{% \setlength\parskip{0pt}% \setlength\itemsep{0ex}% \setlength\topsep{0ex}% \setlength\partopsep{0pt}% \setlength\leftmargin{0pt}% }% \item\MakeFramed {\FrameRestore}% \small% \OriginalVerbatim[##1]% } \renewcommand{\endVerbatim}{% \endOriginalVerbatim% \endMakeFramed% \endlist% % close group to restore \parskip \egroup% } \definecolor{VerbatimColor}{rgb}{0.95,0.95,0.95} \definecolor{VerbatimBorderColor}{rgb}{1.0,1.0,1.0} } \makeatletter \DeclareTextCommandDefault{\textleftarrow}{\mbox{$\m@th\leftarrow$}} \makeatother """ ADDITIONAL_PREAMBLE += r""" %% Redefine sphinxstylethead (used only for table headers) to bold font \usepackage{letltxmacro} \LetLtxMacro{\oldtextsf}{\sphinxstyletheadfamily} \renewcommand{\sphinxstyletheadfamily}[0]{\oldtextsf \bf } """ ADDITIONAL_PREAMBLE += r""" \makeatletter \fancypagestyle{normal}{ \fancyhf{} \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}} \fancyfoot[LE,LO]{%(footer)s} \renewcommand{\headrulewidth}{0.4pt} \renewcommand{\footrulewidth}{0.4pt} } \fancypagestyle{plain}{ \fancyhf{} \fancyfoot[RE,RO]{{\py@HeaderFamily\thepage}} \fancyfoot[LE,LO]{%(footer)s} \renewcommand{\headrulewidth}{0pt} \renewcommand{\footrulewidth}{0.4pt} } \makeatother """ % { 'footer': '' } # From Sphinx 1.5 onwards, there are certain macros which are used which became # deprecated (e.g. \code). These macros should be upgraded in the future so # that we can move away from using the old macro names. 
latex_keep_old_macro_names = True latex_elements = { # Options to pass to packages 'passoptionstopackages': r'\PassOptionsToPackage{dvipsnames, table, xcdraw}{xcolor}', # Set up margins for geometry 'sphinxsetup': 'hmargin={0.75in, 0.75in}, vmargin={0.75in, 0.75in}', # The paper size ('letterpaper' or 'a4paper'). 'papersize': 'a4paper', # Single sided to save paper and improve display 'extraclassoptions': 'openany,oneside', # The font size ('10pt', '11pt' or '12pt'). 'pointsize': '10pt', # Disable figure floating 'figure_align': 'H', # Additional stuff for the LaTeX preamble. 'preamble': ADDITIONAL_PREAMBLE, } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_class = 'manual' if 'DOC_TITLE' in os.environ: latex_title = os.environ['DOC_TITLE'] else: latex_title = "Document title" latex_documents = [ ('index', 'doc.tex', latex_title, author, latex_class), ] # The name of an image file (relative to this directory) to place at the top of # the title page. latex_logo = '../../common/graphcorelogo-pdf.png' # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. latex_use_parts = False # If true, show page references after internal links. latex_show_pagerefs = False # If true, show URL addresses after external links. # latex_show_urls = False # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. latex_domain_indices = False # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html autodoc_default_options = { 'undoc-members': True, } autodoc_inherit_docstrings = True autodoc_typehints = 'description' ================================================ FILE: docs/poptorch_geometric/user_guide/index.rst ================================================ PyTorch Geometric for the IPU: User Guide ========================================= .. 
toctree:: :maxdepth: 4 :numbered: 3 intro installation performance tutorials supported_operations reference legal ================================================ FILE: docs/poptorch_geometric/user_guide/installation.rst ================================================ .. _installation: ============ Installation ============ PopTorch Geometric is included as part of the Poplar SDK (see the `Getting Started guide `_ for your system for how to install the Poplar SDK). PopTorch Geometric is packaged as a Python wheel file that can be installed using ``pip``. PopTorch Geometric requires the installation of PopTorch, which is also a part of the Poplar SDK. To use PopTorch Geometric you must first install the PopTorch wheel and then the PopTorch Geometric wheel. All the necessary dependencies (including ``torch`` and ``pytorch_geometric``) will be installed automatically. .. important:: pip >= 18.1 is required for PopTorch dependencies to be installed properly. To update ``pip``: .. code-block:: bash $ pip install -U pip Version compatibility ~~~~~~~~~~~~~~~~~~~~~ PopTorch Geometric and PopTorch wheels should always come from the same Poplar SDK version to guarantee version compatibility. The following are the corresponding ``poptorch_geometric``, ``pytorch_geometric``, ``torch``, ``torchvision`` and ``torchaudio`` versions and supported Python versions. 
+------------------------+-----------------------+-------------+-----------------+----------------+------------+ | ``poptorch_geometric`` | ``pytorch_geometric`` | ``torch`` | ``torchvision`` | ``torchaudio`` | ``python`` | +========================+=======================+=============+=================+================+============+ | 3.3 | 2.4.0.dev20230613 | 2.0.1 | 0.15.2 | 2.0.1 | >= 3.8 | +------------------------+-----------------------+-------------+-----------------+----------------+------------+ | 3.2 | 2.3.0.dev20230222 | 1.13.1 | 0.14.1 | 0.13.1 | >= 3.7 | +------------------------+-----------------------+-------------+-----------------+----------------+------------+ .. note:: To ensure version compatibility, ``torchvision`` and ``torchaudio`` are automatically installed with PopTorch in Poplar SDK 3.3 and later. Installation using Python virtual environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ We recommend creating and activating a virtual environment to isolate your PopTorch Geometric environment from the system Python environment. You can use the Python ``virtualenv`` tool for this. .. code-block:: bash $ virtualenv -p python3 poptorch_test $ source poptorch_test/bin/activate After activating the virtual environment, you need to first install the PopTorch wheel. .. code-block:: bash $ pip install <sdk_path>/poptorch-x.x.x.whl where ``<sdk_path>`` is the location of the Poplar SDK on your system. See the `PopTorch installation guide `_ for more information on installing the PopTorch wheel. Then, install the PopTorch Geometric wheel: .. code-block:: bash # Enable the Python environment containing PopTorch (if not already enabled) $ source poptorch_test/bin/activate $ pip install <sdk_path>/poptorch_geometric-x.x.x.whl where ``<sdk_path>`` is the location of the Poplar SDK on your system. 
Setting the environment variables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The PopART and Poplar runtime libraries are required to use PopTorch Geometric, so you will need to set the library search paths, using the scripts provided in the SDK: .. code-block:: bash # Enable the Python environment containing PopTorch (if not already enabled) $ source poptorch_test/bin/activate # Add the Poplar and PopART runtime libraries to the search path $ source <sdk_path>/poplar-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh $ source <sdk_path>/popart-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh where ``<sdk_path>`` is the location of the Poplar SDK on your system, ``<os_ver>`` is the version of Ubuntu on your system, ``<poplar_ver>`` is the software version number of the Poplar SDK and ``<build>`` is the build information. Validating the setup ~~~~~~~~~~~~~~~~~~~~ In order to validate that everything is installed correctly in your environment, you can run the following commands and see if they execute without an exception and the displayed version matches the packages that you installed: .. code-block:: bash $ python -c "import poptorch;print(poptorch.__version__)" $ python -c "import poptorch_geometric;print(poptorch_geometric.__version__)" ================================================ FILE: docs/poptorch_geometric/user_guide/intro.rst ================================================ ============ Introduction ============ .. admonition:: Experimental Release This is an experimental release of PopTorch Geometric. Not all features of PyTorch Geometric are supported, and some functions may not work as expected. The implementation may change without warning in future releases in ways that are not backwards compatible. PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling Graph Neural Network models to be trained, evaluated and used on Graphcore IPU hardware. PopTorch Geometric has been designed to require as few changes as possible to your models to run on the IPU. 
However, it does have some differences from native PyTorch Geometric execution, in order to get the most out of IPU hardware. PopTorch Geometric depends on the functionality provided by PopTorch. PopTorch and PopTorch Geometric are included in the `Poplar SDK `__. See the `Getting Started guide `_ for your system for how to install the Poplar SDK. Refer to :numref:`installation` for how to install the PopTorch and PopTorch Geometric wheels. ================================================ FILE: docs/poptorch_geometric/user_guide/legal.rst ================================================ Legal notices ============= |LEGAL:TRADEMARKS| |LEGAL:EULA| © Copyright 2023 Graphcore Ltd. All rights reserved. ================================================ FILE: docs/poptorch_geometric/user_guide/performance.rst ================================================ ====================== Optimizing performance ====================== PopTorch Geometric is an extension of PyTorch Geometric allowing models to fully utilize the IPU hardware and provide the best performance. To achieve that, PopTorch Geometric uses PopTorch functionality. PopTorch Geometric is designed in such a way that users can run PyTorch Geometric models with the least amount of changes to the code and exploit the high performance of IPU systems. When working with the IPU, it is always recommended to use fixed-size tensors. This allows for the static compilation of the Poplar programs and using the same programs for all the iterations of training and/or inference. This constraint is not always met when working with Graph Neural Networks because graphs processed in subsequent iterations can have different numbers of nodes and/or edges, which results in tensors of different shapes. PopTorch Geometric provides ways to satisfy this constraint and reach the best performance. 
Currently, there are two ways to ensure that all the tensors have fixed shapes---using either the `Pad `_ transformation with a data loader or the fixed-size data loaders. .. important:: When working with the IPU, it is required to always use the data loader from PopTorch Geometric, either :py:class:`poptorch_geometric.dataloader.DataLoader` or :py:class:`poptorch_geometric.dataloader.FixedSizeDataLoader`. All the data loaders in PopTorch Geometric take the `options` argument. It can be used to set `PopTorch options `_ to process data even more efficiently. It is recommended to read the `Efficient data batching `_ chapter of the PopTorch documentation, to understand the possible settings of the `options` argument. Pad transformation ================== The `Pad `_ transformation is a graph transformation implemented in PyTorch Geometric. It sets a fixed number of nodes and edges for all the graphs in the dataset and pads the node- and edge-level feature tensors so their sizes match the number of nodes and edges, respectively. Thanks to that, when the data loader creates a batch of graphs, all the feature tensors of the batch have the same fixed size and computations can be performed with high efficiency. A dataset transformed using `Pad` must be used with the :py:class:`poptorch_geometric.dataloader.DataLoader` data loader to guarantee compatibility with the IPU. .. note:: If the dataset you are working on already has fixed-size feature tensors, then using the `Pad` transformation is not necessary and it is enough to use the :py:class:`poptorch_geometric.dataloader.DataLoader` data loader. Using the `Pad` transformation with :py:class:`poptorch_geometric.dataloader.DataLoader` is recommended when the graphs in the dataset have a similar number of nodes and edges, so the number of padding nodes and edges is small. For examples of usage, refer to :numref:`examples_and_tutorials`. 
Fixed-size data loaders ======================= The alternative method is to use the :py:class:`poptorch_geometric.dataloader.FixedSizeDataLoader` class with the dataset without the `Pad` transformation. The data loader uses :py:class:`poptorch_geometric.collate.FixedSizeCollater` underneath to create mini-batches of graphs with a fixed number of nodes and edges from the initial graphs that do not necessarily have the same number of nodes and edges. The data loader combines graphs from the dataset and creates dummy graphs such that the whole mini-batch has a fixed number of nodes, edges and graphs. By default the `FixedSizeStrategy.PadToMax` strategy is used, which pads the mini-batches to a fixed size, where the resulting mini-batches have a fixed number of samples in each mini-batch and one padding graph at the end of the mini-batch. The data loader can also produce packed batches with a variable number of graphs in each mini-batch. This can help reduce the amount of space in each mini-batch assigned to padding. This is enabled by using `FixedSizeStrategy.StreamPack` which changes the underlying sampler to :py:class:`poptorch_geometric.stream_packing_sampler.StreamPackingSampler`. In this case, each mini-batch contains a certain number of dummy graphs, so that the total number of graphs in the mini-batch is constant. Compared to the `Pad` transformation, instead of padding each sample in the batch, the data loader pads the entire batch, which is often more efficient and the created batches are easier to manage since all the padding nodes and edges are at the end. For examples of usage, refer to :numref:`examples_and_tutorials`. ================================================ FILE: docs/poptorch_geometric/user_guide/reference.rst ================================================ .. _reference: ============= API reference ============= .. _api_options: Data loaders ============ .. autoclass:: poptorch_geometric.dataloader.DataLoader .. 
autoclass:: poptorch_geometric.dataloader.FixedSizeDataLoader .. autoclass:: poptorch_geometric.pyg_dataloader.FixedSizeStrategy .. autoclass:: poptorch_geometric.pyg_dataloader.OverSizeStrategy Cluster data loaders ==================== .. autoclass:: poptorch_geometric.cluster_loader.FixedSizeClusterLoader Collators ========= .. autoclass:: poptorch_geometric.collate.FixedSizeCollater Batch samplers ============== .. autoclass:: poptorch_geometric.stream_packing_sampler.StreamPackingSampler Fixed size options ================== .. autoclass:: poptorch_geometric.fixed_size_options.FixedSizeOptions ================================================ FILE: docs/poptorch_geometric/user_guide/supported_operations.rst ================================================ .. _supported_operations: ==================== Supported operations ==================== This chapter contains a list of the PyTorch Geometric operations supported by the Poplar SDK. .. table:: Supported operations :widths: 15, 25, 17, 43 +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Layers kind | Layer name | Status | Notes | +===============================+===========================+===============+===================================================+ | Basic blocks | Linear | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | Sequential | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HeteroLinear | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Convolution Layers | SimpleConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ChebConv | Not supported | | | 
+---------------------------+---------------+---------------------------------------------------+ | | SAGEConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | CuGraphSAGEConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GraphConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GatedGraphConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ResGatedGraphConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | CuGraphGATConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | FusedGATConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | TransformerConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | TAGConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GINConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GINEConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ARMAConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | APPNP | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | MFConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | RGCNConv | Not supported | | | 
+---------------------------+---------------+---------------------------------------------------+ | | CuGraphRGCNConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | SplineConv | Supported | Only inference is supported. | | +---------------------------+---------------+---------------------------------------------------+ | | NNConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | CGConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | EdgeConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DynamicEdgeConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | XConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HypergraphConv | Supported | ``num_edges`` is required. 
| | +---------------------------+---------------+---------------------------------------------------+ | | LEConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | PNAConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ClusterGCNConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | PANConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | WLConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | WLConvContinuous | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | SuperGATConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GeneralConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HGTConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HeteroConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | LGConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | PointGNNConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GPSConv | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | RGATConv | Supported | The ``attention_mechanism`` option | | | | | ``within-relation`` is not supported. 
| | +---------------------------+---------------+---------------------------------------------------+ | | FiLMConv | Supported | ``num_relations`` cannot be greater than 1. | | | | | | | +---------------------------+---------------+---------------------------------------------------+ | | GCNConv | Supported | ``add_self_loops`` must be set to False. | | +---------------------------+ | | | | GravNetConv | | | | +---------------------------+ | | | | GATConv | | | | +---------------------------+ | | | | GATv2Conv | | | | +---------------------------+ | | | | AGNNConv | | | | +---------------------------+ | | | | SGConv | | | | +---------------------------+ | | | | SSGConv | | | | +---------------------------+ | | | | FastRGCNConv | | | | +---------------------------+ | | | | SignedConv | | | | +---------------------------+ | | | | DNAConv | | | | +---------------------------+ | | | | PointNetConv | | | | +---------------------------+ | | | | GMMConv | | | | +---------------------------+ | | | | PPFConv | | | | +---------------------------+ | | | | FeaStConv | | | | +---------------------------+ | | | | PointTransformerConv | | | | +---------------------------+ | | | | GENConv | | | | +---------------------------+ | | | | GCN2Conv | | | | +---------------------------+ | | | | FAConv | | | | +---------------------------+ | | | | EGConv | | | | +---------------------------+ | | | | PDNConv | | | | +---------------------------+ | | | | HEATConv | | | | +---------------------------+ | | | | HANConv | | | | +---------------------------+ | | | | AntiSymmetricConv | | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Aggregation Layers | MultiAggregation | Supported | ``dim_size`` is required. 
| | +---------------------------+ | | | | SumAggregation | | | | +---------------------------+ | | | | MeanAggregation | | | | +---------------------------+ | | | | MaxAggregation | | | | +---------------------------+ | | | | MinAggregation | | | | +---------------------------+ | | | | MulAggregation | | | | +---------------------------+ | | | | VarAggregation | | | | +---------------------------+ | | | | StdAggregation | | | | +---------------------------+ | | | | SoftmaxAggregation | | | | +---------------------------+ | | | | PowerMeanAggregation | | | | +---------------------------+ | | | | DeepSetsAggregation | | | | +---------------------------+ | | | | AttentionalAggregation | | | | +---------------------------+ | | | | Set2Set | | | | +---------------------------+ | | | | DegreeScalerAggregation | | | | +---------------------------+ | | | | MedianAggregation | | | | +---------------------------+ | | | | QuantileAggregation | | | | +---------------------------+---------------+---------------------------------------------------+ | | SortAggregation | Supported | * ``dim_size`` is required. | | +---------------------------+ | * ``max_num_elements`` is required. 
| | | LSTMAggregation | | | | +---------------------------+ | | | | MLPAggregation | | | | +---------------------------+ | | | | GRUAggregation | | | | +---------------------------+---------------+---------------------------------------------------+ | | SetTransformerAggregation | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | GraphMultisetTransformer | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | EquilibriumAggregation | Not supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Normalization layers | BatchNorm | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HeteroBatchNorm | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | HeteroLayerNorm | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | InstanceNorm | Supported | ``batch_size`` is required. | | +---------------------------+ | | | | GraphNorm | | | | +---------------------------+ | | | | LayerNorm | | | | +---------------------------+ | | | | GraphSizeNorm | | | | +---------------------------+ | | | | PairNorm | | | | +---------------------------+---------------+---------------------------------------------------+ | | MeanSubtractionNorm | Supported | ``dim_size`` is required. 
| | +---------------------------+---------------+---------------------------------------------------+ | | MessageNorm | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DiffGroupNorm | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Pooling layers | Pooling | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | global_add_pool | Supported | ``size`` is required. | | +---------------------------+ | | | | global_mean_pool | | | | +---------------------------+ | | | | global_max_pool | | | | +---------------------------+---------------+---------------------------------------------------+ | | TopKPooling | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | PANPooling | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | SAGPooling | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | EdgePooling | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ASAPooling | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | MemPooling | Supported | * ``batch_size`` is required. | | | | | * ``max_num_nodes`` is required. 
| | +---------------------------+---------------+---------------------------------------------------+ | | max_pool | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | avg_pool | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | consecutive_cluster | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | max_pool_neighbor_x | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | avg_pool_neighbor_x | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | max_pool_x | Supported | * ``batch_size`` is required. | | +---------------------------+ | * ``size`` is required. | | | avg_pool_x | | | | +---------------------------+---------------+---------------------------------------------------+ | | graclus | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | voxel_grid | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | fps | Supported | * ``ptr`` is required. | | | | | * ``batch`` has to be ``None``. | | +---------------------------+---------------+---------------------------------------------------+ | | knn | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | knn_graph | Supported | ``loop`` must be set to True. | | | | | | | +---------------------------+---------------+---------------------------------------------------+ | | radius | Supported | ``batch_size`` is required. 
| | +---------------------------+ | | | | radius_graph | | | | +---------------------------+---------------+---------------------------------------------------+ | | nearest | Supported | * ``torch_cluster.nearest`` has to be replaced | | | | | with ``poptorch.nearest``. | | | | | * ``poptorch.nearest`` supports arguments | | | | | ``batch_x`` and ``batch_y`` in the original | | | | | form of ``torch.Tensor`` plus a regular list. | | | | | * Validation of batch indices is NOT performed if | | | | | ``batch_x`` and ``batch_y`` are passed as | | | | | ``torch.Tensor``. | | +---------------------------+---------------+---------------------------------------------------+ | | decimation_indices | Not supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Unpooling layers | knn_interpolate | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Functional | bro | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | gini | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Dense convolutional layers | DenseGCNConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DenseGINConv | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DenseGraphConv | Supported | | + +---------------------------+---------------+---------------------------------------------------+ | | DenseSAGEConv | Supported | | + +---------------------------+---------------+---------------------------------------------------+ | | DenseGATConv | TBD | | 
+-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Dense pooling layers | dense_diff_pool | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | dense_mincut_pool | Not supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DMoNPooling | Not supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | Encoding | PositionalEncoding | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | TemporalEncoding | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ | KGE Models | TransE | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | ComplEx | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | DistMult | Supported | | | +---------------------------+---------------+---------------------------------------------------+ | | RotatE | Supported | | +-------------------------------+---------------------------+---------------+---------------------------------------------------+ ================================================ FILE: docs/poptorch_geometric/user_guide/tutorials.rst ================================================ .. _examples_and_tutorials: ====================== Examples and tutorials ====================== Examples demonstrating different use scenarios for PopTorch Geometric are available in the `Graphcore examples repository on GitHub `_. Tutorials in the form of Jupyter notebooks are available in the `PyTorch Geometric tutorials directory `__. 
These tutorials show how to get the maximum benefit from IPU systems with PopTorch Geometric. ================================================ FILE: docs/user_guide/CMakeLists.txt ================================================
# Examples listed here do not receive the "short" test label.
set(LONG_TESTS mnist inferenceModel)

# Registers the example script <path>/<name>.py as the CTest test
# "<name>_user_guide_example", run with python3 from PROJECT_BINARY_DIR.
#
# Every test is labelled "user_guide_examples". In addition:
#   - "pipeline_simple" is labelled "external_data";
#   - any other example not listed in LONG_TESTS is labelled "short".
function(add_poptorch_py_user_guide_example name path)
  message(STATUS "Adding python example '${name}'")

  # Each branch assigns extra_labels, so no default is needed beforehand.
  if("${name}" STREQUAL "pipeline_simple")
    set(extra_labels ";external_data")
  elseif("${name}" IN_LIST LONG_TESTS)
    set(extra_labels "")
  else()
    set(extra_labels ";short")
  endif()

  add_test(
    NAME "${name}_user_guide_example"
    COMMAND python3 ${path}/${name}.py
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
  set_tests_properties(
    "${name}_user_guide_example"
    PROPERTIES LABELS "user_guide_examples${extra_labels}")
endfunction()

# The tests run from PROJECT_BINARY_DIR, so this places the configuration
# file at the relative path tmp/poptorch.conf.
install(FILES "poptorch.conf" DESTINATION "${PROJECT_BINARY_DIR}/tmp")

# Every Python file in this directory is treated as a user guide example.
file(GLOB EXAMPLES "${CMAKE_CURRENT_SOURCE_DIR}/*.py")

# Run the examples either from copies in the build tree or straight from
# the source tree.
if(COPY_TESTS)
  install(FILES ${EXAMPLES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
  set(DOC_EXAMPLES_PATH "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(DOC_EXAMPLES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

# One test per example; the test name is the file name up to its first ".".
foreach(EXAMPLE ${EXAMPLES})
  get_filename_component(NAME ${EXAMPLE} NAME_WE)
  add_poptorch_py_user_guide_example(${NAME} ${DOC_EXAMPLES_PATH})
endforeach()

if(BUILD_DOCS)
  run_poptorch_install_command(
    "python3 ${PROJECT_SOURCE_DIR}/scripts/docs_build.py --install-dir ${CMAKE_INSTALL_PREFIX} --add-to-sys-path ${CMAKE_INSTALL_PREFIX}"
    "${PROJECT_BINARY_DIR}"
    "docs_build.py")
endif()
================================================ FILE: docs/user_guide/api.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import tempfile
import os
import torch
import poptorch

# Example snippets for the PopTorch user guide. Each region enclosed by a pair
# of `# <name>_start` / `# <name>_end` marker comments is presumably pulled
# into the documentation by name (TODO confirm against the .rst files), so
# keep the markers and the code between them stable. The code outside the
# markers only exists to exercise the snippets when the file is run.


# ctc_beam_search_start
class Model(torch.nn.Module):
    def forward(self, log_probs, lengths):
        return poptorch.ctc_beam_search_decoder(log_probs, lengths)


# ctc_beam_search_end


# print_tensor_start
class ExampleModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bias = torch.nn.Parameter(torch.zeros(()))

    def forward(self, x):
        x = x + 1

        # It is important to make sure the result of the print is used.
        x = poptorch.ipu_print_tensor(x)

        return x + self.bias


# print_tensor_end

# Run the tensor-printing example once as an inference model.
model = poptorch.inferenceModel(ExampleModel())
model(torch.tensor([1.0, 2.0, 3.0]))


# identity_start
def custom_loss(output, target):
    # Mean squared error with a scale
    loss = output - target
    loss = loss * loss * 5
    return poptorch.identity_loss(loss, reduction="mean")


class ExampleModelWithCustomLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = ExampleModel()

    def forward(self, input, target):
        out = self.model(input)
        return out, custom_loss(out, target)


# identity_end

# Train the custom-loss model. The wrapped model computes x + 1 + bias and the
# target is x + 2, so the bias is expected to end up close to 1.0; this is
# asserted after the loop.
model_with_loss = ExampleModelWithCustomLoss()
poptorch_model = poptorch.trainingModel(model_with_loss)
print(f"Bias before training: {model_with_loss.model.bias}")

for _ in range(100):
    out, loss = poptorch_model(input=torch.tensor([1.0, 2.0, 3.0]),
                               target=torch.tensor([3.0, 4.0, 5.0]))
    # NOTE(review): this print is assumed to run on every iteration - confirm.
    print(f"Out = {out}, loss = {float(loss):.2f}")

print(f"Bias after training: {model_with_loss.model.bias}")
torch.testing.assert_close(model_with_loss.model.bias,
                           torch.tensor(1.0, dtype=torch.float),
                           rtol=1e-4,
                           atol=1e-5)

# Tear down the trained model before the optimizer examples wrap a new one.
poptorch_model.destroy()

# Fresh model, inputs and options shared by the optimizer examples below.
model = ExampleModelWithCustomLoss()
input = torch.tensor([1.0, 2.0, 3.0])
target = torch.tensor([3.0, 4.0, 5.0])
options = poptorch.Options()
# optim_start
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         loss_scaling=2.0,
                         use_combined_accum=False)
poptorch_model = poptorch.trainingModel(model, options, opt)
poptorch_model(input, target)
# Update optimizer attribute
opt.loss_scaling = 1.0
# Update param_group attribute
opt.param_groups[0]["loss_scaling"] = 1.0
# Set the new optimizer in the model
poptorch_model.setOptimizer(opt)
poptorch_model(input, target)
# optim_end
poptorch_model.destroy()

# optim_const_start
# lr, momentum and loss_scaling will be marked as variable.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         momentum=0.0,
                         use_combined_accum=False)
# momentum and loss_scaling will be marked as constant.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         use_combined_accum=False)
# lr and momentum will be marked as variable.
# loss_scaling will be marked as constant.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         momentum=0.0,
                         loss_scaling=2.0,
                         use_combined_accum=False)
opt.variable_attrs.markAsConstant("loss_scaling")
# lr, momentum and loss_scaling will be marked as variable.
opt = poptorch.optim.SGD(model.parameters(),
                         lr=0.01,
                         loss_scaling=2.0,
                         use_combined_accum=False)
opt.variable_attrs.markAsVariable("momentum")
# optim_const_end

# torch_optim_const_start
# momentum will be marked as constant (It's not set)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# lr will be marked as variable.
# momentum will still be marked as constant (Because its default value is 0.0)
opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.0)
# lr and momentum will both be marked as variable.
opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=1.0) # torch_optim_const_end # conf_load_start opts = poptorch.Options() opts.loadFromFile("tmp/poptorch.conf") # conf_load_end with tempfile.TemporaryDirectory() as d: PATH = os.path.join(d, "checkpoint.pt") # optim_state_dict_start optimizer = poptorch.optim.Adam(model.parameters()) poptorch_model = poptorch.trainingModel(model, optimizer=optimizer) poptorch_model(input, target) # Saving the optimizer state torch.save({'optimizer_state_dict': optimizer.state_dict()}, PATH) # Destroy original model to prevent an error when wrapping the model again poptorch_model.destroy() new_optimizer = poptorch.optim.Adam(model.parameters()) # Loading the optimizer state back checkpoint = torch.load(PATH) new_optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # The new training model will use the loaded optimizer state new_poptorch_model = poptorch.trainingModel(model, optimizer=optimizer) # optim_state_dict_end ================================================ FILE: docs/user_guide/batching.rst ================================================ .. _efficient_data_batching: ======================= Efficient data batching ======================= By default, PopTorch will process the ``batch_size`` which you provided to the :py:class:`~poptorch.DataLoader`. This value is known as the micro-batch size. When using the other options below, the actual number of samples used per step varies to allow the IPU(s) to process data more efficiently. However, the effective batch size for operations which depend on it (for example the size of mini-batches, in PyTorch's terminology, when using Pytorch's `BatchNorm `__ layers) will not change. All that changes is how much data is actually sent for a single step. .. note:: Failure to use :py:class:`~poptorch.DataLoader` may result in accidentally changing the effective batch size for operations which depend on it, such as batch normalization. 
:tutorials-repo:`PopTorch tutorial: Efficient data loading ` is a detailed tutorial regarding efficient data loading, batching and tuning relevant hyperparameters in PopTorch. poptorch.DataLoader =================== PopTorch provides a thin wrapper around the traditional `torch.utils.data.DataLoader `_ to abstract away some of the batch sizes calculations. If :py:class:`~poptorch.DataLoader` is used in a distributed execution environment, it will ensure that each process uses a different subset of the dataset. If you set the :py:class:`~poptorch.DataLoader` ``batch_size`` to more than 1 then each operation in the model will process that number of elements at any given time. Please see the usage example below. poptorch.AsynchronousDataAccessor ================================= To reduce host overhead you can offload the data loading process to a separate thread by specifying :py:class:`mode=poptorch.DataLoaderMode.Async ` in the :py:class:`~poptorch.DataLoader` constructor. Internally this uses an :py:class:`~poptorch.AsynchronousDataAccessor`. Doing this allows you to reduce the host/IPU communication overhead by using the time that the IPU is running to load the next batch on the CPU. This means that when the IPU is finished executing and returns to host the data will be ready for the IPU to pull in again. .. literalinclude:: device_iterations.py :caption: Use of AsynchronousDataAccessor :start-after: data_accessor_start :end-before: data_accessor_end :emphasize-lines: 10 :linenos: .. warning:: Tensors being iterated over using an :py:class:`~poptorch.AsynchronousDataAccessor` use shared memory. You must clone tensors at each iteration if you wish to keep their references outside of each iteration. Consider the following example: .. 
code-block:: python :emphasize-lines: 5 predictions, labels = [], [] for data, label in dataloader: predictions += poptorch_model(data) labels += label The ``predictions`` list will be correct because it's producing a new tensor from the inputs. However, the list ``labels`` will contain identical references. This line would need to be replaced with the following: .. code-block:: python labels += label.detach().clone() Rebatching iterable datasets ---------------------------- There are `two types of datasets in PyTorch `_ : map-style datasets and iterable datasets. As explained in the notes of PyTorch's `Data Loading Order and Sampler `_: for `IterableDataset `_: "When fetching from iterable-style datasets with multi-processing, the drop_last argument drops the last non-full batch of each worker's dataset replica." This means that if the number of elements is naively divided among the number of workers (which is the default behaviour) then potentially a significant number of elements will be dropped. For example: .. code-block:: python num_tensors = 100 num_workers = 7 batch_size = 4 per_worker_tensors = ceil(100 / num_workers) = 15 last_worker_tensors = 100 - (num_workers - 1) * per_worker_tensors = 10 num_tensors_used = batch_size * (floor(per_worker_tensors / batch_size) * (num_workers - 1) + floor(last_worker_tensors / batch_size)) = 80 This means in this particular case 20% of the dataset will never be used. But, in general, the larger the number of workers and the batch size, the more data will end up being unused. To work around this issue PopTorch has a :py:class:`mode=poptorch.DataLoaderMode.AsyncRebatched <poptorch.DataLoaderMode>`. PopTorch will set the ``batch_size`` in the PyTorch Dataset and DataLoader to 1 and will instead create the batched tensors in its worker process.
The shape of the tensors returned by the DataLoader will be the same as before, but the number of used tensors from the dataset will increase to ``floor(num_tensors / batch_size) * batch_size`` (which means all the tensors would be used in the example above). .. note:: This flag is not enabled by default because the behaviour is different from the upstream DataLoader. .. _device_iterations: poptorch.Options.deviceIterations ================================= When training, a device iteration corresponds to one iteration of the training loop executed on the IPU, starting with data loading, followed by the forward and backward passes, and ending with a weight update. If :ref:`gradient accumulation <gradient_accumulation>` is not used and you set :py:meth:`~poptorch.Options.deviceIterations` to `n`, PopTorch will carry out this loop `n` times (processing `n` micro-batches) on the IPU before returning control to the host, which will improve processing efficiency. If gradient accumulation is used then the number of micro-batches processed will be `n` multiplied by the value set using :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`. For inference, a device iteration corresponds to data loading and the forward pass. Note that the returned output dimensions depend on :py:meth:`~poptorch.Options.outputMode`. The default value for :py:func:`~poptorch.trainingModel` is `Final`, since you will often not need to receive all or any of the output tensors and it is more efficient not to receive them. Therefore, only the last batch of data will be returned to the host under this setting. You can change this behaviour by setting the value of :py:meth:`~poptorch.Options.outputMode` to `All`. This returns the result of every batch to the host. .. note:: When running an :py:class:`~poptorch.inferenceModel` with :py:class:`~poptorch.PipelinedExecution`, you must set :py:meth:`~poptorch.Options.deviceIterations` to at least the number of pipeline steps. .. 
literalinclude:: device_iterations.py :caption: Use of device iterations and batch size :start-after: iterations_start :end-before: iterations_end :emphasize-lines: 51, 57, 63 :linenos: poptorch.Options.replicationFactor ================================== :py:meth:`~poptorch.Options.replicationFactor` will replicate the model over multiple IPUs to allow automatic data parallelism across many IPUs. .. literalinclude:: device_iterations.py :caption: Use of replication factor :start-after: replication_start :end-before: replication_end :emphasize-lines: 13 :linenos: poptorch.Options.inputReplicaGrouping ===================================== :py:meth:`~poptorch.Options.inputReplicaGrouping` allows the input batches to be split between groups of replicas, in a similar way to what :py:func:`~replicaGrouping` does for weight tensors. See :numref:`grouping_tensor_weights`. .. _gradient_accumulation: poptorch.Options.Training.gradientAccumulation ============================================== You can use :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` to run a number of micro-batches before updating parameters (weights) during training. The number of gradient accumulations is equal to the number of micro-batches (batches whose size is specified as the ``batch_size`` value provided to the :py:class:`~poptorch.DataLoader`) which are processed between model updates. After accumulation, PopTorch updates the model using the gradients accumulated from processing all the batches. .. note:: When running an :py:class:`~poptorch.inferenceModel`, you must set :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` to 1. As mentioned in :numref:`pipelined_execution`, you need to use gradient accumulations when training with :py:class:`~poptorch.PipelinedExecution` because the parameters can only be updated between pipeline runs. You need to set the number of accumulations to at least the number of pipeline stages. 
However, with this value, the pipeline will switch into the "ramp-down" period as soon as it has finished the "ramp-up" period. Using a larger number of gradient accumulations means that the pipeline will run at full efficiency for longer. However, the increase in batches between parameter updates may reduce the overall training efficiency of your model. The optimal number of gradient accumulations is a trade off between these two factors. .. note:: :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` is only needed by :py:class:`~poptorch.PipelinedExecution`. Other execution modes may benefit from it because the IPUs will spend less time updating parameters during training. .. literalinclude:: device_iterations.py :caption: Use of gradient accumulation :start-after: gradient_acc_start :end-before: gradient_acc_end :emphasize-lines: 12 :linenos: In the code example below, :py:class:`~poptorch.Block` introduced in :numref:`execution_strategies` is used to divide up a different model into disjoint subsets of layers. These blocks can be shared among multiple parallel execution strategies. .. literalinclude:: mnist.py :language: python :linenos: :start-after: annotations_start :end-before: annotations_end :emphasize-lines: 12, 14, 16, 18, 34 :caption: A training model making use of :py:class:`~poptorch.Block` You can see the code examples of :py:class:`~poptorch.SerialPhasedExecution`, :py:class:`~poptorch.PipelinedExecution`, and :py:class:`~poptorch.ShardedExecution` below. An instance of class :py:class:`~poptorch.PipelinedExecution` defines an execution strategy that assigns layers to multiple IPUs as a pipeline. Gradient accumulation is used to push multiple batches through the pipeline allowing IPUs to run in parallel. .. 
literalinclude:: mnist.py :caption: An example of different parallel execution strategies :language: python :linenos: :start-after: annotations_strategy_start :end-before: annotations_strategy_end :emphasize-lines: 6, 13, 19, 21 :numref:`figPipeline` shows the pipeline execution for multiple batches on IPUs. There are 4 pipeline stages running on 4 IPUs respectively. Gradient accumulation enables us to keep the same number of pipeline stages, but with a wider pipeline. This helps hide the latency, which is the total time for one item to go through the whole system, as highlighted. .. _figPipeline: .. figure:: IPU-pipeline.jpg :width: 400 Pipeline execution with gradient accumulation .. _trainingOutputMode: poptorch.Options.outputMode ========================================== When you use a :py:func:`~poptorch.inferenceModel`, you will usually want to receive all the output tensors. For this reason, PopTorch will return them all to you by default. However, you can change this behaviour using :py:func:`~poptorch.Options.outputMode`. When you use a :py:func:`~poptorch.trainingModel`, you will often not need to receive all or any of the output tensors and it is more efficient not to receive them. For this reason, PopTorch only returns the last batch of tensors by default. As in the case of ``inferenceModel``, you can change this behaviour using :py:func:`~poptorch.Options.outputMode`. If you want to monitor training using a metric such as loss or accuracy, you may wish to take into account all tensors. To do this with minimal or no overhead, you can use ``poptorch.OutputMode.Sum``. For example: .. literalinclude:: sumAnchorReturnType.py :caption: A model which returns training accuracy as a tensor :language: python :linenos: :start-after: model_returning_accuracy_start :end-before: model_returning_accuracy_end ..
literalinclude:: sumAnchorReturnType.py :caption: Efficient calculation of training accuracy across all batches :language: python :linenos: :start-after: sum_accuracy_start :end-before: sum_accuracy_end ================================================ FILE: docs/user_guide/buffers.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import torch import poptorch # counter_model_wrong_start class CounterModel(torch.nn.Module): def __init__(self): super().__init__() self.i = torch.tensor([0.], dtype=torch.float) def forward(self): self.i += 1 return self.i model = CounterModel() poptorch_model = poptorch.inferenceModel(model) print(poptorch_model()) # tensor([1.]) print(poptorch_model()) # tensor([1.]) # counter_model_wrong_end torch.testing.assert_close(model.i, torch.tensor([1.], dtype=torch.float)) # pragma pylint: disable=function-redefined,no-member # counter_model_correct_start class CounterModel(torch.nn.Module): def __init__(self): super().__init__() self.register_buffer("i", torch.tensor([0.], dtype=torch.float)) def forward(self): self.i += 1 return self.i model = CounterModel() poptorch_model = poptorch.inferenceModel(model) print(poptorch_model()) # tensor([1.]) print(poptorch_model()) # tensor([2.]) # counter_model_correct_end # Because the model is running in inference mode, we will need to manually # call copyWeightsToHost poptorch_model.copyWeightsToHost() torch.testing.assert_close(model.i, torch.tensor([2.], dtype=torch.float)) ================================================ FILE: docs/user_guide/debugging.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import torch import poptorch class Model(torch.nn.Module): def __init__(self): super().__init__() self.fc1 = torch.nn.Linear(10, 10) self.relu = torch.nn.ReLU() self.fc2 = torch.nn.Linear(10, 10) self.loss = torch.nn.MSELoss(reduction="mean") def forward(self, x, labels=None): out = self.fc2(self.relu(self.fc1(x))) if self.training: return self.loss(out, labels) return out # tensor_names_start input = torch.rand(10, 10) label = torch.rand(10, 10) model = Model() poptorch_model = poptorch.trainingModel(model) poptorch_model(input, label) tensor_names = poptorch_model.getTensorNames() # tensor_names_end # tensor_anchor_start opts = poptorch.Options() opts.anchorTensor('grad_bias', 'Gradient___fc2.bias') opts.anchorTensor('update_weight', 'UpdatedVar___fc2.weight') # tensor_anchor_end poptorch_model.destroy() # tensor_retrieve_start poptorch_model = poptorch.trainingModel(model, opts) poptorch_model(input, label) grad = poptorch_model.getAnchoredTensor('grad_bias') update = poptorch_model.getAnchoredTensor('update_weight') # tensor_retrieve_end poptorch_model.destroy() # optim_state_dict_start optim = poptorch.optim.SGD(model.parameters(), lr=0.01) poptorch_model = poptorch.trainingModel(model, opts, optim) poptorch_model(input, label) state = optim.state_dict() # optim_state_dict_end ================================================ FILE: docs/user_guide/debugging.rst ================================================ ===================== Debugging your model ===================== Inspecting tensors ================== PopTorch allows you to inspect arbitrary tensors in both inference and training models. This is very useful for debugging conditions such as overflows, underflows or vanishing gradients. Numerous tensors are generated during model compilation. In order to inspect their values, you first have to find their names. You can retrieve the complete list of tensor names in your model by calling :py:func:`~poptorch.PoplarExecutor.getTensorNames`. 
Note that the model must first be compiled. .. literalinclude:: debugging.py :caption: Retrieving the list of tensor names :start-after: tensor_names_start :end-before: tensor_names_end :emphasize-lines: 8 Anchoring tensors ================= Once you have chosen a few tensors of interest, the next step is to create anchors. Anchoring enables a tensor to be observed by the application without it having to be a model output. You can create an anchor by calling :py:func:`~poptorch.Options.anchorTensor`. It takes two mandatory string parameters: a convenient user-defined name for the anchor and the name of the chosen tensor. Optionally, you may specify the output mode as well as the output return period. In order for these option settings to take effect, they must be set before model compilation. In the example below, two anchors are created: one for a bias gradient tensor and one for the updated weights of a linear layer. .. literalinclude:: debugging.py :caption: Anchoring tensors :start-after: tensor_anchor_start :end-before: tensor_anchor_end :emphasize-lines: 2, 3 Retrieving tensors ================== The anchored tensors will be updated after every model invocation. You can retrieve their values using :py:func:`~poptorch.PoplarExecutor.getAnchoredTensor`. The function takes a single parameter - the user-defined anchor name. In the example below, we execute one training run and retrieve the values of the two tensors we anchored previously. .. literalinclude:: debugging.py :caption: Retrieving anchored tensors :start-after: tensor_retrieve_start :end-before: tensor_retrieve_end :emphasize-lines: 4, 5 For a more practical understanding of observing tensors, the `Graphcore GitHub examples repository `__ contains a tutorial you can follow about observing tensors, using anchoring and generating a gradient histogram: :tutorials-repo:`PopTorch tutorial: Observing tensors `.
Inspecting optimiser state ========================== You can inspect the optimiser state without using anchoring. After you instantiate a :py:func:`~poptorch.trainingModel`, the optimiser's `state_dict()` function will return the internal optimiser's state. This state dictionary will be populated when the training model is compiled, and is updated after each training step. .. literalinclude:: debugging.py :caption: Inspecting optimiser state :start-after: optim_state_dict_start :end-before: optim_state_dict_end :emphasize-lines: 5 .. note:: The entries in PopTorch's optimiser `state_dict()` may differ from those in PyTorch in both name and structure. ================================================ FILE: docs/user_guide/device_iterations.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import sys import poptorch if not poptorch.ipuHardwareIsAvailable(): print("Replicated top level graphs are not supported on the IPU model") sys.exit(0) # pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order # iterations_start from functools import reduce from operator import mul import torch import poptorch class ExampleModelWithLoss(torch.nn.Module): def __init__(self, data_shape, num_classes): super().__init__() self.fc = torch.nn.Linear(reduce(mul, data_shape), num_classes) self.loss = torch.nn.CrossEntropyLoss() def forward(self, x, target=None): reshaped = x.reshape([x.shape[0], -1]) fc = self.fc(reshaped) if target is not None: return fc, self.loss(fc, target) return fc class ExampleDataset(torch.utils.data.Dataset): def __init__(self, shape, length): super().__init__() self._shape = shape self._length = length self._all_data = [] self._all_labels = [] torch.manual_seed(0) for _ in range(length): label = 1 if torch.rand(()) > 0.5 else 0 data = torch.rand(self._shape) + label data[0] = -data[0] self._all_data.append(data) self._all_labels.append(label) def 
__len__(self): return self._length def __getitem__(self, index): return self._all_data[index], self._all_labels[index] def device_iterations_example(): # Set the number of samples for which activations/gradients are computed # in parallel on a single IPU model_batch_size = 2 # Create a poptorch.Options instance to override default options opts = poptorch.Options() # Run a 100 iteration loop on the IPU, fetching a new batch each time opts.deviceIterations(100) # Set up the DataLoader to load that much data at each iteration training_data = poptorch.DataLoader(opts, dataset=ExampleDataset(shape=[3, 2], length=10000), batch_size=model_batch_size, shuffle=True, drop_last=True) model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts) # Run over the training data, 100 batches at a time (specified in # opts.deviceIterations()) for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 100 iteration loop of batchsize 2. # "output" and "loss" will be the respective output and loss of the # final batch (the default OutputMode). output, loss = poptorch_model(data, labels) print(f"{labels[-1]}, {output}, {loss}") # iterations_end poptorch_model.destroy() # release the IPUs # replication_start def replication_factor_example(): # Set the number of samples for which activations/gradients are computed # in parallel on a single IPU model_batch_size = 2 # replication_start # Create a poptorch.Options instance to override default options opts = poptorch.Options() # Run a 100 iteration loop on the IPU, fetching a new batch each time opts.deviceIterations(100) # Duplicate the model over 4 replicas. 
opts.replicationFactor(4) training_data = poptorch.DataLoader(opts, dataset=ExampleDataset(shape=[3, 2], length=100000), batch_size=model_batch_size, shuffle=True, drop_last=True) model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts) # Run over the training data, 100 batches at a time (specified in # opts.deviceIterations()) for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 100 iteration loop of model batchsize 2 # across 4 IPUs (global batchsize = 2 * 4 = 8). "output" and "loss" # will be the respective output and loss of the final batch of each # replica (the default OutputMode). output, loss = poptorch_model(data, labels) print(f"{labels[-1]}, {output}, {loss}") # replication_end poptorch_model.destroy() # release the IPUs # gradient_acc_start def gradient_accumulation_example(): # Set the number of samples for which activations/gradients are computed # in parallel on a single IPU model_batch_size = 2 # Create a poptorch.Options instance to override default options opts = poptorch.Options() # Run a 400 iteration loop on the IPU, fetching a new batch each time opts.deviceIterations(400) # Accumulate the gradient 8 times before applying it. opts.Training.gradientAccumulation(8) training_data = poptorch.DataLoader(opts, dataset=ExampleDataset(shape=[3, 2], length=100000), batch_size=model_batch_size, shuffle=True, drop_last=True) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts) # Run over the training data, 400 batches at a time (specified in # opts.deviceIterations()) for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 400 iteration loop of model batchsize 2 # with gradient updates every 8 iterations (global batchsize = 2 * 8 = 16).
# "output" and "loss" will be the respective output and loss of the # final batch of each replica (the default OutputMode). output, loss = poptorch_model(data, labels) print(f"{labels[-1]}, {output}, {loss}") # gradient_acc_end poptorch_model.destroy() # release the IPUs def data_accessor_example(): # Not displayed: just to keep the linter happy shape = [3, 2] num_tensors = 100 batch_size = 1 num_workers = 0 device_iterations = 1 replication_factor = 1 # Example starts here: # data_accessor_start opts = poptorch.Options() opts.deviceIterations(device_iterations) opts.replicationFactor(replication_factor) loader = poptorch.DataLoader(opts, ExampleDataset(shape=shape, length=num_tensors), batch_size=batch_size, num_workers=num_workers, mode=poptorch.DataLoaderMode.Async) poptorch_model = poptorch.inferenceModel(model, opts) for it, (data, _) in enumerate(loader): out = poptorch_model(data) # data_accessor_end poptorch_model.destroy() # release the IPUs model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2) model_batch_size = 2 # distributed_execution_start def process(process_id=0, num_processes=1): # Create a poptorch.Options instance to override default options opts = poptorch.Options() # Run a 400 iteration loop on the IPU, fetching a new batch each time opts.deviceIterations(400) # Replicate the graph across 2 IPUs in each process. opts.replicationFactor(2) # Set the id of the current process and the total number of processes. opts.Distributed.configureProcessId(process_id, num_processes) # Accumulate the gradient 8 times before applying it. opts.Training.gradientAccumulation(8) # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
opts.randomSeed(42) training_data = poptorch.DataLoader(opts, dataset=ExampleDataset(shape=[3, 2], length=100000), batch_size=model_batch_size, shuffle=True, drop_last=True) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts) # Run over the training data with "batch_size" 200 essentially. for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 100 iteration loop of batchsize 8 across # 4 IPUs (batch-size 2 per replica). "output" and "loss" will be the # respective output and loss of the final batch of each replica # (the default OutputMode). output, loss = poptorch_model(data, labels) print(f"{batch_number} {labels[-1]}, {output}, {loss}") # distributed_execution_end poptorch_model.destroy() # release the IPUs # AsynchronousDataAccessor must run in the main process if __name__ == "__main__": device_iterations_example() replication_factor_example() gradient_accumulation_example() data_accessor_example() process() ================================================ FILE: docs/user_guide/error_handling.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import torch import poptorch from poptorch.poptorch_core import TestErrorType # pragma pylint: disable=broad-except # This is a fake model which actually throws an exception class PytorchModel(torch.nn.Module): def __init__(self, error): super().__init__() if error is not None: poptorch.poptorch_core._throwTestError(error) def forward(self, x, y): return x + y def run_example(model_param=None): rebooted = False shutdown = False def reboot_server(): nonlocal rebooted rebooted = True def shutdown_system(): nonlocal shutdown shutdown = True # error_handling_start try: m = PytorchModel(model_param) inference_model = poptorch.inferenceModel(m) t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) assert inference_model(t1, t2) == 3.0 except poptorch.RecoverableError as e: print(e) if e.recovery_action == "FULL_RESET": reboot_server() elif e.recovery_action == "IPU_RESET": print("Need to reset the IPU") elif e.recovery_action == "PARITION_RESET": print("Need to reset the partition") except poptorch.UnrecoverableError as e: print(f"Unrecoverable error: machine needs to be taken offline: {e}") shutdown_system() except poptorch.Error as e: print(f"Received {e.message} from component {e.type}, " f"location: {e.location}") # Or you could just print all the information at once: print(e) except Exception as e: print(e) # error_handling_end if model_param == TestErrorType.PoplarRecoverableFullReset: assert rebooted elif model_param == TestErrorType.PoplarUnrecoverable: assert shutdown else: assert not rebooted assert not shutdown if __name__ == "__main__": # Check the example is valid run_example() for t in TestErrorType.__members__.values(): run_example(t) ================================================ FILE: docs/user_guide/example.rst ================================================ Examples ======== You can find PyTorch examples and tutorials in the Graphcore GitHub `examples repository `__. 
This contains: * Examples of popular machine learning models for training and inference * :tutorials-repo:`Tutorials ` * :tutorials-repo:`Examples of PopTorch and IPU features ` * :tutorials-repo:`Examples of simple models ` * Source code from videos, blogs and other documents MNIST example _____________ The example in :numref:`mnist-example-code` shows how an MNIST model can be run on the IPU. The highlighted lines show the PopTorch-specific code required to run the example on multiple IPUs. You can download the full source code from GitHub: :github-poptorch:`mnist.py `. To run this example you will need to install the Poplar SDK (see the `Getting Started Guide `_ for your IPU system) and the appropriate version of ``torchvision``: .. code-block:: console $ python3 -m pip install torchvision==0.11.1 .. literalinclude:: ../../examples/mnist.py :caption: MNIST example :name: mnist-example-code :start-after: mnist_start :end-before: mnist_end :emphasize-lines: 12, 15, 17, 20, 35, 96, 99 :language: python :dedent: 3 :linenos: :lineno-match: ================================================ FILE: docs/user_guide/experimental.rst ================================================ ===================== Experimental features ===================== Distributed execution without PopRun ==================================== PopTorch supports distributed execution on a Pod using the IPU over Fabric (IPUoF). If you run a program using your own distributed processing tool instead of PopRun, the only change you need to make to your code is to set the ID of the current process and the total number of processes the execution is distributed across, using :py:meth:`~poptorch.options._DistributedOptions.configureProcessId`. Note that :py:meth:`~poptorch.Options.replicationFactor` should be used to set the number of local replicas (per host) not the total (global) number of replicas. .. 
literalinclude:: device_iterations.py :caption: Changes required for distributed execution :start-after: distributed_execution_start :end-before: distributed_execution_end :emphasize-lines: 9, 12, 18 :linenos: .. note:: ``DataLoader`` will automatically select a different subset of the dataset based on the process ID. .. warning:: All the processes must use the same seed if ``shuffle=True`` is used for the ``DataLoader``. torch.nn.CTCLoss ================ The CTCLoss operator is supported, with some limitations: #. The ``reduction`` parameter must be set to either ``sum`` or ``mean`` #. The ``targets`` tensor must be 2D, corresponding to stacked, padded layout ================================================ FILE: docs/user_guide/hostio_optimisation.rst ================================================ ===================== Efficient IPU I/O ===================== When developing applications for the IPU, maximising I/O performance is important. If an application is still I/O-bound after optimising host data loading, then you can explore further optimisations of the movement of data into the IPU. This chapter will cover two specific optimisations that can improve I/O performance: * prefetch and multibuffering * overlapping compute and I/O Prefetch and multibuffering =========================== Poplar supports prefetching and multibuffering to improve I/O performance. For more details, see `Optimising host data transfers `__ in the Poplar and PopLibs User Guide. Prefetch is enabled by default in Poplar. The default buffer depth is 1. You can increase the value for the buffer depth to improve I/O performance: .. code-block:: python opts = poptorch.Options() opts._Popart.set("defaultPrefetchBufferingDepth", 3) Using multibuffering is especially useful when you see large ``StreamCopyBegin`` or ``StreamCopyEnd`` phases in your application's profile. For example, :numref:`figNoBuffering` shows a profile of a simple program without using buffering. 
The program consists of a loop where the IPU gets data from the host, processes it and sends the result back. The ``StreamCopy``, in light orange represents the data transfer. The first one is the host to IPU transfer, the second one is the IPU to host transfer. They are split into a ``Begin``, a ``Mid``, and an ``End`` phase. In the ``Begin`` and ``End`` phases, the IPU waits for the host to become ready. In the ``Mid`` phase the IPU performs the transfer. Between the ``StreamCopy`` operations are the compute steps, shown in red. In this profile, you can see the IPU is waiting for data from the host for a significant amount of time. .. figure:: no-buffering-profile.png :name: figNoBuffering :width: 100% Profile with multibuffering disabled :numref:`figWithBuffering` shows the profile of the same program with buffering. You can see that the IPU no longer waits for the host: the ``Begin`` and ``End`` section of the ``StreamCopy`` are gone. .. figure:: with-buffering-profile.png :name: figWithBuffering :width: 100% Profile with multibuffering enabled and related improvements Overlapping compute and I/O =========================== To optimise I/O further, you can choose to dedicate a specified number of tiles to communication and leave the rest of the tiles for compute. Computation time will be adversely affected by having access to fewer tiles, so there is a trade-off between optimising I/O and optimising compute here. To overlap compute and I/O: #. In PopTorch's ``Options``, you must specify the number of I/O tiles and select one of ``ShardedExecution``, ``ParallelPhasedExecution`` or ``SerialPhasedExecution`` as the ``ExecutionStrategy``: .. code-block:: python opts.TensorLocations.numIOTiles(64) opts.setExecutionStrategy(poptorch.ShardedExecution()) #. In the forward method of the model, you must set the ``OverlapMode`` for the inputs and outputs of the model to ``OverlapDeviceIterationLoop``, as follows: .. 
code-block:: python def forward(self, x): x = poptorch.set_overlap_for_input(x, poptorch.OverlapMode.OverlapDeviceIterationLoop) x = some_compute(x) x = poptorch.set_overlap_for_output(x, poptorch.OverlapMode.OverlapDeviceIterationLoop) return x :numref:`figWithBufferingOverlap` shows the profile of our simple program with both compute I/O overlap and multibuffering enabled. The compute (in red) and the I/O (in orange) are stacked on top of each other since they both happen at the same time. .. _figWithBufferingOverlap: .. figure:: with-buffering-overlap-profile.png Profile with both multibuffering and I/O compute overlap enabled and related improvements ================================================ FILE: docs/user_guide/index.rst ================================================ PyTorch for the IPU: User Guide =============================== .. toctree:: :maxdepth: 4 :numbered: 3 intro installation pytorch_to_poptorch overview batching supported_ops debugging hostio_optimisation example experimental reference legal ================================================ FILE: docs/user_guide/inferenceModel.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import os import poptorch # If running on the model then make sure to run on the full size model to # avoid running out of memory. if not poptorch.ipuHardwareIsAvailable(): os.environ["POPTORCH_IPU_MODEL"] = "1" # pylint: disable=reimported # pylint: disable=ungrouped-imports # pylint: disable=wrong-import-order # pylint: disable=wrong-import-position # inference_model_start import torch import torchvision import poptorch # Some dummy imagenet sized input. picture_of_a_cat_here = torch.randn([1, 3, 224, 224]) # The model, in this case a MobileNet model with pretrained weights that comes # canned with PyTorch. 
model = torchvision.models.mobilenet_v2(pretrained=True) model.train(False) # Wrap in the PopTorch inference wrapper inference_model = poptorch.inferenceModel(model) # Execute on IPU. out_tensor = inference_model(picture_of_a_cat_here) # Get the top 5 ImageNet classes. top_five_classes = torch.topk(torch.softmax(out_tensor, 1), 5) print(top_five_classes) # Try the same on native PyTorch native_out = model(picture_of_a_cat_here) native_top_five_classes = torch.topk(torch.softmax(native_out, 1), 5) # Models should be very close to native output although some operations are # numerically different and floating point differences can accumulate. assert any(top_five_classes[1][0] == native_top_five_classes[1][0]) # inference_half_start model = torch.nn.Linear(1, 10) # Cast the parameters (weights) to half. model.half() t1 = torch.tensor([1.]).half() opts = poptorch.Options() inference_model = poptorch.inferenceModel(model, opts) out = inference_model(t1) assert out.dtype == torch.half # inference_half_end ================================================ FILE: docs/user_guide/installation.rst ================================================ .. _installation: ============ Installation ============ .. contents:: :local: PopTorch is included with the Poplar SDK (see the `Getting Started guide `_ for your system for how to install the Poplar SDK.). PopTorch is packaged as a Python wheel file that can be installed using ``pip``. .. important:: pip >= 18.1 is required for PopTorch dependencies to be installed properly. To update ``pip``: .. code-block:: bash $ pip install -U pip Version compatibility ===================== The following are the corresponding ``torch``, ``torchvision``, ``torchaudio`` and ``torch_scatter`` versions and supported Python versions. 
+--------------+-----------+-----------------+----------------+------------------------------+------------+ | ``poptorch`` | ``torch`` | ``torchvision`` | ``torchaudio`` | ``torch_scatter`` | ``python`` | +==============+===========+=================+================+==============================+============+ | 3.3 | 2.0.1 | 0.15.2 | 2.0.1 | >=2.0.9 and <=2.1.1 | >=3.8 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 3.2 | 1.13.1 | 0.14.1 | 0.13.1 | >=2.0.9 and <=2.1.0 | >=3.7 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 3.1 | 1.13.0 | 0.14.0 | 0.13.0 | >=2.0.9 and <=2.1.0 | >=3.7 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 3.0 | 1.10.0 | 0.11.1 | 0.10.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.6 | 1.10.0 | 0.11.1 | 0.10.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.5 | 1.10.0 | 0.11.1 | 0.10.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.4 | 1.10.0 | 0.11.1 | 0.10.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.3 | 1.9.0 | 0.10.0 | 0.9.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.2 | 1.9.0 | 0.10.0 | 0.9.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.1 | 1.7.1 | 0.8.2 | 0.7.1 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ | 2.0 | 1.7.1 | 0.8.2 | 0.7.1 | N/A | >=3.6 | 
+--------------+-----------+-----------------+----------------+------------------------------+------------+ | 1.4 | 1.6.0 | 0.7.0 | 0.6.0 | N/A | >=3.6 | +--------------+-----------+-----------------+----------------+------------------------------+------------+ Based on https://github.com/pytorch/vision/blob/master/README.md .. note:: To ensure version compatibility, ``torchvision`` and ``torchaudio`` are automatically installed with PopTorch in Poplar SDK 3.3 and later. Using a Python virtual environment ================================== We recommend creating and activating a virtual environment to isolate your PopTorch environment from the system Python environment. You can use the Python tool ``virtualenv`` for this. You can create a virtual environment and install PopTorch as shown below: .. code-block:: bash $ virtualenv -p python3 poptorch_test $ source poptorch_test/bin/activate $ pip install -U pip $ pip install <sdk_path>/poptorch_x.x.x.whl .. _setting_env: Setting the environment variables ================================= The PopART and Poplar runtime libraries are required to use PopTorch, so you will need to set the library search paths, using the scripts provided in the SDK: .. code-block:: bash # Enable the Python environment containing PopTorch (if not already enabled) $ source poptorch_test/bin/activate # Add the Poplar and PopART runtime libraries to the search path $ source <sdk_path>/poplar-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh $ source <sdk_path>/popart-ubuntu_<os_ver>-<poplar_ver>+<build>/enable.sh where ``<sdk_path>`` is the location of the Poplar SDK on your system. ``<os_ver>`` is the version of Ubuntu on your system, ``<poplar_ver>`` is the software version number of the Poplar SDK and ``<build>`` is the build information. Validating the setup ==================== You can run this simple example to verify that the system is working as expected. This example can be found in the Poplar SDK ``examples`` directory. ..
literalinclude:: ../../examples/simple_adder.py :caption: Simple adder example :language: python :linenos: ================================================ FILE: docs/user_guide/intro.rst ================================================ ============ Introduction ============ PopTorch is a set of extensions for PyTorch to enable PyTorch models to run directly on the Graphcore IPU. PopTorch has been designed to require as few changes as possible to your models in order to run on the IPU. However, it does have some differences from native PyTorch execution, to get the most out of IPU hardware. The `IPU Programmer's Guide `__ provides an introduction to the IPU architecture, programming model and tools available. PopTorch is included with the `Poplar SDK `__. See the `Getting Started guide `_ for your system for how to install the Poplar SDK. Refer to :numref:`installation` for how to install the PopTorch wheel. In the Graphcore software stack, PyTorch sits at the highest level of abstraction. Poplar and PopLibs provide a software interface to operations running on the IPU. PopTorch compiles PyTorch models into Poplar executables and also provides IPU-specific functions. .. figure:: pytorch-software-stack.png :align: center :width: 100% PyTorch, PopTorch and the Poplar software stack PopTorch supports executing native PyTorch models for both inference and training. To run a PyTorch model on the IPU, you must wrap your model with either: * :py:func:`~poptorch.inferenceModel` * :py:func:`~poptorch.trainingModel` Both of these functions accept a PyTorch model (`torch.nn.Module `_) and create a representation of the model that can be executed on the IPU hardware. In training mode, PopTorch uses its own automatic differentiation engine (autograd) that differs from native PyTorch. The input model (`torch.nn.Module `_) is required to have at least one loss built into the forward pass. PopTorch backpropagates the gradients from the loss value(s) to update the model parameters. 
This is all taken care of automatically so your training loop does not need to call ``.backward()`` on the loss value(s) or ``.step()`` on the optimiser. The following example shows a typical native PyTorch training loop. The model incorporates a loss criterion within the ``.forward()`` method, and returns the loss value as a second output (along with the prediction). This native PyTorch training loop manually invokes the ``.backward()`` method to backpropagate the gradients. The loop also manually updates the optimiser by calling the ``.step()`` method. .. literalinclude:: poptorch_training_simple.py :caption: A simple example of training using PyTorch on the CPU :linenos: :start-after: simple_cpu_start :end-before: simple_cpu_end Data batching ============= An equivalent training loop executing the model on the IPU with PopTorch is shown below. The :py:class:`~poptorch.DataLoader` class is used to efficiently load data batches on the IPU. PopTorch follows the data batching semantics of `PopART `__. By default, this means you will just pass in data of the normal batch size. However, there are a number of options provided in PopTorch which will enable more efficient data loading. See :numref:`efficient_data_batching` for more information. Notice that the `torch.optim.AdamW `_ optimiser is passed as an input argument to the :py:func:`~poptorch.trainingModel` wrapper which applies the optimiser algorithm during training on the IPU. The optimiser state is automatically managed by the PopART framework so there is no need to call the ``.step()`` method. Another significant change from the native training loop is there is no ``loss.backward()``. As mentioned above, PopTorch uses its own automatic differentiation engine and will detect the loss value to backpropagate the gradients from. .. 
literalinclude:: poptorch_training_simple.py :caption: Equivalent code using PopTorch to train on the IPU :linenos: :start-after: simple_ipu_start :end-before: simple_ipu_end Parallel and Distributed execution ================================== To scale your models, you can enable :ref:`execution_strategies` using the PopTorch :ref:`annotation_tools` to label or wrap individual parts of your model and assign parts of the model to an individual IPU or execution phase. You can also use PopTorch's :ref:`available_execution_strategies` to determine how the model executes the phases. Having assigned the model to run on one or more IPUs, you can add additional parallelism with replication. Each replica represents an additional copy of the entire model. These copies run in parallel. PopTorch can also run across multiple hosts. This is necessary for using more than 64 IPUs across IPU Pod systems and may be beneficial when using a smaller number of IPUs, for example with models that involve intensive pre-processing on the CPU. We recommend using the PopRun command-line tool and PopDist configuration library, which can automatically set up PopTorch to run across multiple IPU-POD hosts. Refer to the `PopDist and PopRun User Guide `__ for more information, including details about the installation of Horovod if you are using the MPI communication protocol. .. _constraints: Constraints =========== The following constraints apply when using PopTorch: * All tensor data types and shapes must be constant for the entire dataset. * As PopTorch compiles to a static graph, it cannot handle control flow variations within the model. This means that the inputs passed at run-time cannot vary the control flow of the model or the shapes or sizes of results. If this is attempted, the graph will be frozen to whichever control flow path was activated as a result of the first inputs given to the wrapped model. * Not all PyTorch operations are implemented within the PopTorch compiler. 
See :numref:`supported_ops` for a list of operators that are supported on the IPU. Please also report any unsupported operators to support@graphcore.ai so that these ops may be incorporated into a future release. * Whilst any argument type can be used in the forward method, only tensor arguments may change between model invocations, as other types will be statically compiled inside the executable. Other resources =============== `Switching from GPUs to IPUs for Machine Learning Models `__ provides a high-level overview of the programming changes required when switching from GPUs to IPUs and `Memory and Performance Optimisation on the IPU `__ presents guidelines to help you develop high-performance machine learning models running on the IPU. The Graphcore `Examples GitHub repository `_ contains PopTorch applications, :tutorials-repo:`feature examples `, :tutorials-repo:`tutorials ` and :tutorials-repo:`simple applications `. Further developer resources can be found on `Graphcore's developer portal `_. ================================================ FILE: docs/user_guide/legal.rst ================================================ Trademarks & copyright ====================== |LEGAL:TRADEMARKS| |LEGAL:EULA| Copyright © 2020-|YEAR| Graphcore Ltd. All rights reserved. ================================================ FILE: docs/user_guide/mnist.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import argparse
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from tqdm import tqdm
import poptorch


def get_mnist_data(opts):
    """Build the PopTorch data loaders for the MNIST training and test splits.

    Both loaders share one ``poptorch.Options`` instance, shuffle their data
    and drop the last incomplete batch.

    Args:
        opts: Parsed command-line options. ``batch_size`` and
            ``batches_per_step`` are multiplied to give the training loader's
            batch size; ``test_batch_size`` is the validation loader's.

    Returns:
        A ``(training_data, validation_data)`` tuple of
        ``poptorch.DataLoader`` objects.
    """
    options = poptorch.Options()

    def mnist_split(train):
        # Both splits use the same download directory and normalisation.
        return torchvision.datasets.MNIST(
            'mnist_data/',
            train=train,
            download=True,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
            ]))

    training_data = poptorch.DataLoader(options,
                                        mnist_split(train=True),
                                        batch_size=opts.batch_size *
                                        opts.batches_per_step,
                                        shuffle=True,
                                        drop_last=True)

    validation_data = poptorch.DataLoader(options,
                                          mnist_split(train=False),
                                          batch_size=opts.test_batch_size,
                                          shuffle=True,
                                          drop_last=True)
    return training_data, validation_data


# NOTE: the regions delimited by the "annotations_start"/"annotations_end" and
# "annotations_strategy_start"/"annotations_strategy_end" marker comments are
# presumably included verbatim in the user guide (literalinclude), so no
# comments or docstrings are added inside them.
#
# Network is a four-layer MLP whose layers are placed in the PopTorch blocks
# "B1".."B4"; TrainingModelWithLoss wraps a model and, when labels are given,
# also computes a cross-entropy loss inside block "B4".
#annotations_start
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = nn.Linear(784, 784)
        self.layer2 = nn.Linear(784, 784)
        self.layer3 = nn.Linear(784, 128)
        self.layer4 = nn.Linear(128, 10)
        self.softmax = nn.Softmax(1)

    def forward(self, x):
        x = x.view(-1, 784)
        with poptorch.Block("B1"):
            x = self.layer1(x)
        with poptorch.Block("B2"):
            x = self.layer2(x)
        with poptorch.Block("B3"):
            x = self.layer3(x)
        with poptorch.Block("B4"):
            x = self.layer4(x)
            x = self.softmax(x)
        return x


class TrainingModelWithLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, args, loss_inputs=None):
        output = self.model(args)
        if loss_inputs is None:
            return output
        with poptorch.Block("B4"):
            loss = self.loss(output, loss_inputs)
        return output, loss


#annotations_end


def accuracy(predictions, labels):
    """Return the percentage of predictions whose arg-max equals the label.

    Args:
        predictions: Tensor of per-class scores; the arg-max is taken over
            dimension 1.
        labels: Tensor of target class indices. It may hold more entries than
            ``predictions``; only the trailing ``predictions.size()[0]``
            labels are compared.

    Returns:
        The accuracy as a float in percent.
    """
    _, ind = torch.max(predictions, 1)
    # Keep only the labels for which a prediction is available: during
    # training, not every sample's prediction is returned, for efficiency
    # reasons.
    labels = labels[-predictions.size()[0]:]
    return torch.sum(torch.eq(ind, labels)).item() / \
        labels.size()[0] * 100.0


def train(training_model, training_data, opts):
    """Run the training loop, showing loss and accuracy on a progress bar.

    Args:
        training_model: Callable taking ``(data, labels)`` and returning
            ``(predictions, losses)``.
        training_data: Iterable of ``(data, labels)`` batches that also
            supports ``len()``.
        opts: Parsed command-line options; ``epochs`` sets the number of
            passes and a truthy ``profile`` stops after the first batch.
    """
    nr_batches = len(training_data)
    for epoch in range(1, opts.epochs + 1):
        print("Epoch {0}/{1}".format(epoch, opts.epochs))
        bar = tqdm(training_data, total=nr_batches)
        for data, labels in bar:
            preds, losses = training_model(data, labels)
            with torch.no_grad():
                mean_loss = torch.mean(losses).item()
                acc = accuracy(preds, labels)
            bar.set_description("Loss:{:0.4f} | Accuracy:{:0.2f}%".format(
                mean_loss, acc))
            # When profiling, a single iteration is all that is needed.
            if opts.profile:
                return


def test(inference_model, test_data):
    """Evaluate the model and print the mean per-batch accuracy.

    Args:
        inference_model: Callable taking a data batch and returning
            predictions.
        test_data: Iterable of ``(data, labels)`` batches that also supports
            ``len()``.
    """
    nr_batches = len(test_data)
    sum_acc = 0.0
    with torch.no_grad():
        for data, labels in tqdm(test_data, total=nr_batches):
            output = inference_model(data)
            sum_acc += accuracy(output, labels)
    print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))


def _str_to_bool(value):
    """Parse a command-line string into a bool.

    ``type=bool`` cannot be used with argparse for this: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.

    Args:
        value: The raw argument string.

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean
            spelling.
    """
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string is kept falsy, as it was with ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='MNIST training in PopTorch')
    parser.add_argument('--batch-size',
                        type=int,
                        default=4,
                        help='batch size for training (default: 4)')
    parser.add_argument('--batches-per-step',
                        type=int,
                        default=8,
                        help='device iteration (default:8)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=2,
                        help='batch size for testing (default: 2)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1,
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument(
        '--profile',
        type=str,
        help=
        "do a single iteration of training for profiling and place in a folder"
    )
    parser.add_argument('--strategy',
                        choices=['plain', 'pipelined', 'phased'],
                        default='plain',
                        help='execution strategy')
    parser.add_argument('--offload-opt',
                        type=_str_to_bool,
                        help="offload optimizer state")

    opts = parser.parse_args()

    poptorch.setLogLevel("DEBUG")  # Force debug logging

    #annotations_strategy_start
    training_data, test_data = get_mnist_data(opts)
    model = Network()
    model_with_loss = TrainingModelWithLoss(model)
    model_opts = poptorch.Options().deviceIterations(1)
    if opts.strategy == "phased":
        strategy = poptorch.SerialPhasedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(0)
        strategy.stage("B3").ipu(0)
        strategy.stage("B4").ipu(0)
        model_opts.setExecutionStrategy(strategy)
    elif opts.strategy == "pipelined":
        strategy = poptorch.PipelinedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(1)
        strategy.stage("B3").ipu(2)
        strategy.stage("B4").ipu(3)
        model_opts.setExecutionStrategy(strategy)
        model_opts.Training.gradientAccumulation(opts.batches_per_step)
    else:
        strategy = poptorch.ShardedExecution("B1", "B2", "B3", "B4")
        strategy.stage("B1").ipu(0)
        strategy.stage("B2").ipu(0)
        strategy.stage("B3").ipu(0)
        strategy.stage("B4").ipu(0)
        model_opts.setExecutionStrategy(strategy)

    if opts.offload_opt:
        model_opts.TensorLocations.setActivationLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setWeightLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setAccumulatorLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(True))
        model_opts.TensorLocations.setOptimizerLocation(
            poptorch.TensorLocationSettings().useOnChipStorage(False))

    training_model = poptorch.trainingModel(
        model_with_loss,
        model_opts,
        optimizer=optim.AdamW(model.parameters(), lr=opts.lr))

    # run training, on IPU
    train(training_model, training_data, opts)
    #annotations_strategy_end

    # NOTE(review): this exits with status 1 even though the profiling run
    # completed; confirm that callers rely on a non-zero status here.
    if opts.profile:
        sys.exit(1)

    # Update the weights in model by copying from the training IPU. This updates (model.parameters())
    training_model.copyWeightsToHost()

    # Check validation loss on IPU once trained. Because PopTorch will be compiled on first call the
    # weights in model.parameters() will be copied implicitly.
Subsequent calls will need to call # inference_model.copyWeightsToDevice() inf_opts = poptorch.Options().deviceIterations(opts.test_batch_size) strategy = poptorch.ShardedExecution("B1", "B2", "B3", "B4") strategy.stage("B1").ipu(0) strategy.stage("B2").ipu(0) strategy.stage("B3").ipu(0) strategy.stage("B4").ipu(0) inf_opts.setExecutionStrategy(strategy) inference_model = poptorch.inferenceModel(model, inf_opts) test(inference_model, test_data) ================================================ FILE: docs/user_guide/overview.rst ================================================ ======== Features ======== .. contents:: :local: :depth: 3 Options ======= You can change how PopTorch compiles and executes models using :py:class:`~poptorch.Options`. You can find a full list of options in :numref:`api_options`. Broadly speaking, the options fall into the following categories: #. General options (see :py:class:`~poptorch.Options`) #. Options related to half precision (see :py:class:`opts.Precision.* `) #. Management of the training process (see :py:class:`opts.Training.* `) #. Location of tensors (see: :py:class:`opts.TensorLocations.* ` and :py:class:`~poptorch.TensorLocationSettings`) #. Options relevant to the Torch JIT compiler (see :py:class:`opts.Jit.* `) #. Control of distributed execution environments when using tools other than `PopRun `__ (see :py:class:`opts.Distributed.* `) See :numref:`efficient_data_batching` for a full explanation of how :py:meth:`~poptorch.Options.deviceIterations`, :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` and :py:meth:`~poptorch.Options.replicationFactor` interact with a model's input and output sizes. You can choose to use the IPU Model instead of IPU hardware with the :py:meth:`~poptorch.Options.useIpuModel` option. 
Setting options via config file ------------------------------- In addition to setting these options programmatically, you can also set them in a config text file by using :py:func:`~poptorch.Options.loadFromFile`. Each line in the file must contain a single command corresponding to setting an option in :py:class:`~poptorch.Options`. To set an option within the file, write the command as you would within a Python script but omit the ``options.`` prefix. For example: .. literalinclude:: poptorch.conf :language: python :caption: Example contents of a config file used to set options :linenos: Then, instantiate :py:class:`~poptorch.Options` and call :py:func:`~poptorch.Options.loadFromFile`: .. literalinclude:: api.py :language: python :caption: Setting options using a config file named "poptorch.conf" :linenos: :start-after: conf_load_start :end-before: conf_load_end :emphasize-lines: 2 Model wrapping functions ======================== The basis of PopTorch integration comes from the two model wrapping functions described in the following sections. .. note:: PopTorch makes a shallow copy of the model. Changes to the parameters in the models returned by these two model wrapping functions affect the original model and vice versa. However, primitive variable types will not be kept in sync. This includes the ``training`` bool of ``torch.nn.Module``. If your PyTorch model is named ``model``, call ``model.eval()`` or ``model.train()``, if required, before calling these wrapping functions. poptorch.trainingModel ---------------------- This function wraps a PyTorch model, yielding a PopTorch model that can be run on the IPU in training mode. See :py:func:`~poptorch.trainingModel` for more information. .. literalinclude:: trainingModel.py :language: python :caption: An example of the use of trainingModel :linenos: :emphasize-lines: 22 :start-after: training_model_start :end-before: training_model_end .. note:: By default, PopTorch will only return the final batch of outputs. 
Please see :numref:`trainingOutputMode` for details on what PopTorch returns when using :py:func:`~poptorch.trainingModel` and how you can calculate statistics such as training accuracy over all batches. poptorch.inferenceModel ----------------------- This function wraps a PyTorch model, yielding a PopTorch model that can be run on the IPU in inference mode. See :py:func:`~poptorch.inferenceModel` for more information. .. literalinclude:: inferenceModel.py :language: python :caption: An example of the use of inferenceModel :linenos: :start-after: inference_model_start :emphasize-lines: 14 poptorch.PoplarExecutor ----------------------- You should not create this class directly. It is a wrapper around the model that was passed into :py:func:`~poptorch.inferenceModel` or :py:func:`~poptorch.trainingModel`. It has a few methods which you can use to interface with the IPU. The :py:class:`~poptorch.PoplarExecutor` will implicitly keep in sync the parameters of the source PyTorch model and the PopTorch model(s). However, you need to explicitly copy the weights before you run a model on the IPU if you train the model on the CPU after you have already wrapped it for the IPU. You also need to explicitly copy the weights if you alter an already wrapped model parameter by some other means. See :py:class:`~poptorch.PoplarExecutor` for a complete description of the IPU interface functionality. .. literalinclude:: trainingModel.py :language: python :caption: Example contents of when explicit copies are needed :linenos: :start-after: explicit_copy_start :end-before: explicit_copy_end poptorch.isRunningOnIpu ----------------------- One useful utility function is :py:func:`~poptorch.isRunningOnIpu`. This returns ``True`` when executing on the IPU and ``False`` when executing the model outside IPU scope. This allows for different code paths within the model. A common use case is executing equivalent code to a PopART custom operator when running on the CPU. For example: .. 
code-block:: python class Network(torch.nn.Module): def forward(self, x, y): if poptorch.isRunningOnIpu(): # IPU path return my_custom_operator(x, y) else: # CPU path return my_torch_implementation(x,y) Error handling ============== Recoverable runtime errors -------------------------- This category of error is likely to be transient. Exception type raised by PopTorch: `poptorch.RecoverableError` (inherits from `poptorch.Error`) The exception contains the action required to recover from this error in its `recovery_action` string attribute. This attribute can contain: - `IPU_RESET`: Reset the IPU and reload the IPU memory. - `LINK_RESET`: Reset the IPU-Links in a non-Pod system. This retrains the IPU-Links between IPUs. - `PARTITION_RESET`: Reset the IPU partition in a Pod system. This retrains the IPU-Links between IPUs. - `FULL_RESET`: Power cycle the system. Unrecoverable runtime errors ---------------------------- These errors are likely to persist. You should take the system out of operation for analysis and repair. Exception type raised by PopTorch: `poptorch.UnrecoverableError` (inherits from `poptorch.Error`) Application and other errors ---------------------------- This kind of error is due to an error in the program or a misuse of an API. Exception type raised by PopTorch: `poptorch.Error` if the error was detected in the C++ backend, or some generic Python `Exception` if it happened in the Python layer. `poptorch.Error` has the following string attributes: - `message` The error message without any of the context. - `type` The part of the software stack that raised the exception and the category of the error if available. - `location` Where the exception was raised. Example: .. literalinclude:: error_handling.py :language: python :linenos: :start-after: error_handling_start :end-before: error_handling_end :caption: How to handle recoverable / unrecoverable errors .. 
_execution_strategies: Multi-IPU execution strategies ============================== This section describes strategies to run PopTorch code on more than one IPU. Some of these allow you to run code in parallel on multiple IPUs. You will need to use one of these execution strategies for PopTorch code that does not fit on a single IPU, but if you do not explicitly select one, PopTorch will use the default execution strategy, :py:class:`~poptorch.PipelinedExecution`. .. note:: In general, we advise pipelining over as few IPUs as possible. However, you may need to experiment to find the optimal pipeline length. In some corner cases, a longer pipeline can lead to faster throughput. There are four kinds of execution strategies that you can use to run a model on a multi-IPU device: * :py:class:`~poptorch.PipelinedExecution` * :py:class:`~poptorch.ShardedExecution` * :py:class:`~poptorch.SerialPhasedExecution` * :py:class:`~poptorch.ParallelPhasedExecution` You can select this with the :py:func:`~poptorch.Options.setExecutionStrategy` option. The following subsections first introduce the general functions which are relevant to all four parallel execution strategies. Next, they explain the four strategies with examples. By default, PopTorch will not let you run the model if the number of IPUs is not a power of 2. For this reason, it is preferable to annotate the model so that the number of IPUs used is a power of 2. However, you can also enable :py:func:`~poptorch.Options.autoRoundNumIPUs` to automatically round up the number of IPUs reserved to a power of 2, with the excess being reserved but idle. This option is not enabled by default to prevent unintentional overbooking of IPUs. .. _annotation_tools: Annotations ----------- In PopTorch, you can divide a model into blocks. Blocks are associated to stages and stages can be grouped into phases. This chapter will describe how to define them and how to use them to set up different execution modes. .. _figStages: .. 
figure:: stages_summary.png :width: 581 PopTorch model partition summary Model partitioning using blocks ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :py:class:`~poptorch.BeginBlock` is a wrapper class, :py:class:`~poptorch.Block` is a context manager, and :py:func:`~poptorch.BlockFunction` is a function decorator. You can use one or more of these to partition models into "blocks" that can be executed on different IPUs. You can use :py:class:`~poptorch.BeginBlock` to annotate an existing model. Each call, with example arguments ``(layer_n, ipu_id=m)``, places layers enclosed in ``layer_n`` on IPU ``m``. Note that, PopTorch places the first layers on ``ipu_id`` 0 by default. However, layers in between :py:class:`~poptorch.BeginBlock` annotations will inherit that of the previous annotated block. .. literalinclude:: pipeline_simple.py :language: python :linenos: :start-after: annotations_start :end-before: annotations_end :emphasize-lines: 37-38, 41-42, 45-46 :caption: Annotating existing layers .. note:: The :py:class:`~poptorch.BeginBlock` annotations internally use PyTorch hooks. If the module passed to :py:func:`~poptorch.BeginBlock` uses hooks, for example with `register_forward_pre_hook `__, then the assignment of operations to blocks may depend on the order those hooks are added. A concrete example may help to clarify this: consider a layer, and an operation that is defined in a hook function. If ``register_forward_pre_hook()`` is called on the layer, followed by a call to :py:func:`~poptorch.BeginBlock` passing the same layer as argument, then the operation defined in the hook will be assigned to the preceding block (so not the same block as the layer). If instead the call to :py:func:`~poptorch.BeginBlock` happens before ``register_forward_pre_hook()``, then the operation will be assigned in the same block as the layer. You can use :py:class:`~poptorch.Block` to annotate a model from within its definition. 
This context manager class defines a scope in the context of the model. Everything within that scope is placed on the IPU specified (unless overridden by a :py:class:`~poptorch.Stage`). .. literalinclude:: pipeline_simple.py :language: python :linenos: :start-after: annotations_inline_start :end-before: annotations_inline_end :emphasize-lines: 16, 19, 22, 26 :caption: Annotating a model directly In addition, you can use the :py:func:`~poptorch.BlockFunction` function decorator to place functions (containing one or more layers) onto a particular block. Everything within that function is placed on the IPU specified (unless overridden by a :py:class:`~poptorch.Stage`). .. literalinclude:: pipeline_simple.py :language: python :linenos: :start-after: annotations_decorator_start :end-before: annotations_decorator_end :emphasize-lines: 19, 25 :caption: Annotating functions You can use any, or a combination, of these three annotation options. In the above examples, ``ipu_id`` is used to specify blocks. This alone is sufficient to enable parallel execution: by default, :py:class:`~poptorch.AutoStage` will set up a pipeline for which the pipeline stage is equal to the ``ipu_id`` for each block. However, it would be equally valid to instead use the ``user_id`` argument to assign names to each block. Then, using :py:class:`~poptorch.Stage` or :py:class:`~poptorch.Phase` classes, you can manually assign each block in a pipeline using their names, as outlined in the next sections. :py:class:`~poptorch.BeginBlock`, :py:class:`~poptorch.Block` and :py:func:`~poptorch.BlockFunction` need to follow a set of rules: * You must declare all the layers inside a :py:class:`~poptorch.Block` scope, using either the context manager or :py:func:`~poptorch.BlockFunction`, to avoid missing annotations. :py:class:`~poptorch.BeginBlock` does not have this constraint because all the layers called after this will automatically be added to the last :py:class:`~poptorch.BeginBlock`. 
* Note that PopTorch needs to reserve IPUs in powers of 2. You are advised to configure your model accordingly to take full advantage of the IPUs available. However, if you need to run with a different number of IPUs, you can use ``poptorch.Options().autoRoundNumIPUs(True)`` to allow PopTorch to reserve more IPUs than the model specifies. * You should not include unused or dead layers in any :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`. * If layer A happens before layer B inside the model and each layer has a :py:class:`~poptorch.BeginBlock` associated with it, you need to write :py:class:`~poptorch.BeginBlock` for layer A before :py:class:`~poptorch.BeginBlock` for layer B. Failing to obey the above rules will result in compilation errors. poptorch.Stage and poptorch.AutoStage ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Conceptually, :py:class:`~poptorch.BeginBlock` and :py:class:`~poptorch.Block` collect the layers of a model into a :py:class:`~poptorch.Stage`. You can combine multiple stages into a :py:class:`~poptorch.Phase`. Multiple phases form an execution strategy. poptorch.Stage """""""""""""" :py:class:`~poptorch.Stage` defines the layers of the model to run on one IPU. A stage can consist of one or more blocks created using :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` and identified by their ``user_id``. You can define consecutive layers in a model in either the same stage or consecutive stages. Whether stages run in parallel or sequentially depends on the specific execution strategy. Internally, each operation in a model is assigned a ``stage_id`` through :py:class:`~poptorch.Stage`. poptorch.AutoStage """""""""""""""""" You can use :py:class:`~poptorch.AutoStage` if you don't want to specify stages by hand. This will assign one :py:class:`~poptorch.Stage` per :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`. 
By default, ``AutoStage.SameAsIpu`` is true, which means the ``stage_id`` of the :py:class:`~poptorch.Stage` will be set to the ``ipu_id`` specified for the :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`. Note that ``stage_id`` must have ascending values in :py:class:`~poptorch.PipelinedExecution`. Let's use the code example above. If your blocks "0", "1", and "2" are assigned to IPUs 0, 1, and 0, then :py:class:`~poptorch.Block` "2" will be assigned ``stage_id`` 0. This will cause the compiler to fail to schedule the last two stages "1" and "2" due to a conflict: * The model implies "1" should run earlier than "2" * Their ``stage_id`` values suggest "2" should run earlier than "1" When ``AutoStage.AutoIncrement`` is true, each new :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` will be assigned an automatically incremented ``stage_id``. In the previous example the last stage would be assigned ``stage_id`` 2 and the compilation would succeed. poptorch.Phase ^^^^^^^^^^^^^^ :py:class:`~poptorch.Phase` defines a processing unit of phased execution. It can contain one or more :py:class:`~poptorch.Stage` stages. :py:class:`~poptorch.Phase` is only used in :py:class:`~poptorch.SerialPhasedExecution` and :py:class:`~poptorch.ParallelPhasedExecution`. It is not used in :py:class:`~poptorch.ShardedExecution` and :py:class:`~poptorch.PipelinedExecution`. .. literalinclude:: phased_execution.py :language: python :caption: Example of Stage declaration :linenos: :start-after: stage_start :end-before: stage_end In the code snippet above, "A" and "B" will run in parallel on IPUs 0 and 1 simultaneously because they are placed in two stages. They will run sequentially on one IPU if they are placed in a single stage. Advanced annotation with strings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can use Python strings to represent the ``user_id`` and ``ipu_id`` for a :py:class:`~poptorch.Block` or :py:class:`~poptorch.BeginBlock`. 
Because strings are evaluated at runtime, they allow for a dynamic number of stages and phases. Here is an example showing how to use formatted strings (f-strings) in :py:class:`~poptorch.ParallelPhasedExecution`. In :numref:`parallel_phased_example`, there are several places where f-strings are used: * Line 25: ``f"phase{phase}_ipu{ipu}"``, where ``phase`` has the values 0, 1, 1, 2, 3, 3, 4, 5, and 5, and ``ipu`` ranges from 0 to 1. The total number of instances for this f-string is 12, from 6 phases and 2 IPUs. * Line 32: ``f"phase{N*2-1}_ipu1"``, where ``phase`` is 5 and ``ipu`` is 1. * Lines 46-47 and 50-51: when defining :py:class:`~poptorch.Stage`, four f-strings are used where ``n`` ranges from 0 to 2 * ``f"phase_{2*n}_ipu0"`` * ``f"phase{2*n}_ipu1"`` * ``f"phase_{2*n+1}_ipu0"`` * ``f"phase{2*n+1}_ipu1"`` These refer to phases 0, 2, 4 and 1, 3, 5, with ``ipu0`` and ``ipu1``, respectively. So all these 12 f-strings are defined in :py:class:`~poptorch.BeginBlock`, and used in :py:class:`~poptorch.Stage` dynamically. These match exactly. .. literalinclude:: phased_execution.py :caption: An example of parallel phased execution :language: python :linenos: :start-after: annotations_start :end-before: annotations_end :emphasize-lines: 23, 30, 45-46, 49-50 :name: parallel_phased_example With the above functions as building blocks, you can set execution strategies using the four kinds of execution modes, as shown below. .. _available_execution_strategies: Available execution strategies ------------------------------ Note that you can use the same annotation for each execution strategy. They only differ in the method of parallelisation and tensor locations. .. _pipelined_execution: Pipelined execution ^^^^^^^^^^^^^^^^^^^ :py:class:`~poptorch.PipelinedExecution` is the default execution strategy. 
When running a model for inference with :py:class:`~poptorch.PipelinedExecution`, you must set :py:meth:`~poptorch.Options.deviceIterations` to be greater than or equal to the number of pipeline stages used by the model. You can also do this for training to improve efficiency. Each time you switch IPU, PopTorch adds a new pipeline stage. If two consecutive blocks/stages use the same IPU, PopTorch will merge them into a single block/stage. It is usually better not to revisit an IPU, creating more than one pipeline stage on the same IPU, because the IPU can not run both stages at the same time. Hence in most cases, the number of pipeline stages for inference will be equal to the number of IPUs you have used. When training, PopTorch doubles the number of pipeline stages in order to run backpropagation, except for the last forward stage which becomes a combined forward and backward pipeline stage (:numref:`fig_poptorch_pipelining`). .. _fig_poptorch_pipelining: .. figure:: pipelined_execution.png :width: 95% PopTorch pipelined execution for training. The last forward stage is combined with the first backward stage. You must set :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` to be greater than or equal to the number of pipeline stages (forward and backward). As well as these constraints, you must also consider that the number of batches obtained each time you call the model will be multiplied (from the conventional model batch size, known as the micro-batch size) by :py:meth:`~poptorch.Options.deviceIterations` * (:py:meth:`~poptorch.Options.replicationFactor` / ``input_group_size``) * :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation` during training and :py:meth:`~poptorch.Options.deviceIterations` * (:py:meth:`~poptorch.Options.replicationFactor` / ``input_group_size``) during inference (for details of ``input_group_size`` see :py:meth:`~poptorch.Options.replicationFactor`). 
You can use :py:class:`poptorch.DataLoader` to abstract this calculation but you should still be aware that this will take place. .. note:: The effective or conventional batch size for layers which depend on it (such as batch normalization) is known as the micro-batch size. If you use :py:class:`~poptorch.DataLoader`, the ``batch_size`` which you pass to it is the micro-batch size. After each IPU has finished processing a micro-batch, the same IPU immediately starts processing the next micro-batch while the next IPU processes the micro-batch that the same IPU just processed. This creates a pipeline which processes multiple micro-batches in parallel. An IPU can only start its own stage of a micro-batch after the previous stage of that micro-batch has been processed. Hence, not all IPUs will be occupied until after a "ramp-up" period. There is also a "ramp-down" period at the end of processing, during which there are no new micro-batches entering the pipeline for the first IPU to process while the IPUs down the pipeline still have micro-batches to process. Hence, during this period, the number of IPUs occupied will reduce each step. For this reason, you should try using a larger value for :py:meth:`~poptorch.options._TrainingOptions.gradientAccumulation`. But you should note that reducing the frequency of parameter updates will also have an adverse effect on training. Although you only define the :py:class:`~poptorch.Phase` for forward passes, the corresponding phases for backward passes are also created. .. _sharded_execution: Sharded execution ^^^^^^^^^^^^^^^^^ In this strategy, each IPU will sequentially execute a distinct part of the model. A single unit of processing in :py:class:`~poptorch.ShardedExecution` is called a shard. A shard is specified using :py:class:`~poptorch.Stage`, or if no :py:class:`~poptorch.Stage` is specified, the ``user_id`` passed by :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` is used. 
Each shard is executed sequentially on a single IPU (:numref:`fig_poptorch_sharded`). You can place multiple shards on multiple IPUs. However, only one IPU is used at a time, while the other IPUs are idle. .. _fig_poptorch_sharded: .. figure:: sharded_execution.png :width: 95% PopTorch sharded execution for training. If an IPU is allocated to run consecutive stages, PopART will merge consecutive stages into one on the same IPU. Weights and activations will use the on-chip memory of the IPUs. You need to place layers that share weights on the same IPU. :py:class:`~poptorch.ShardedExecution` can be useful for processing a single sample or for debugging. Overall, it has low efficiency because only one IPU is used at a time. Phased execution ^^^^^^^^^^^^^^^^ :py:class:`~poptorch.ParallelPhasedExecution` and :py:class:`~poptorch.SerialPhasedExecution` have the following features in common: * A portion of the weights and activations are transferred to and from Streaming Memory, before and after each phase. * If the desired weights and activations are already stored in an IPU of the same group of IPUs, intra-phase cross-IPU copies can replace the copies to and from Streaming Memory. * This specific portion is needed by the layers of the model wrapped in :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block` in current :py:class:`~poptorch.Phase`. * They both trade off some performance for larger models with higher memory needs. * Any number of phases is allowed. * The number of stages in each :py:class:`~poptorch.Phase` should match the number of IPUs in each group of IPUs. * Stages inside each :py:class:`~poptorch.Phase` can run in parallel. Although you only define the :py:class:`~poptorch.Phase` for forward passes, the corresponding phases for backward passes are also created. The order of phased execution for backward passes won't change but you can decide whether a phase is shared by both forward and backward passes. 
In other words, you decide whether to avoid a memory transfer of a portion of the weights and activations. Serial phased execution """"""""""""""""""""""" In :py:class:`~poptorch.SerialPhasedExecution`, phases execute on a single group of IPUs sequentially. .. literalinclude:: phased_execution.py :language: python :caption: How to use SerialPhasedExecution :linenos: :start-after: serial_start :end-before: serial_end The code above causes all phases to run serially on IPUs 0 and 1. (A, B and C on IPU 0; A2, B2 and C2 on IPU 1). Parallel phased execution """"""""""""""""""""""""" In :py:class:`~poptorch.ParallelPhasedExecution`, phases are executed in parallel alternating between two groups of IPUs. Even phases must run on even IPUs and odd phases on odd IPUs. Inter-phase cross-IPU copies can replace the memory transfers to and from the Streaming Memory, if the desired weights and activations are already available in another group of IPUs. .. literalinclude:: phased_execution.py :language: python :caption: How to use ParallelPhasedExecution :linenos: :start-after: parallel_start :end-before: parallel_end In the code example above, there are three phases. Each phase has two stages and each IPU group has two IPUs, so the number of stages in each phase matches the number of IPUs in each group. Even phases 0 and 2 run on IPUs 0 and 2, while odd phase 1 runs on IPUs 1 and 3. This allows for faster cross-IPU copies, both inter-phase and intra-phase. poptorch.Liveness """"""""""""""""" :py:class:`~poptorch.Liveness` controls the availability of tensors on IPU, and is only needed for :py:class:`~poptorch.ParallelPhasedExecution` and :py:class:`~poptorch.SerialPhasedExecution`. The default :py:class:`~poptorch.Liveness` is ``AlwaysLive``. ``OffChipAfterFwd``, ``OffChipAfterFwdNoOverlap`` and ``OffChipAfterEachPhase`` may be helpful if you run a large model with a tight memory budget. .. 
_grouping_tensor_weights: Grouping tensor weights across replicas --------------------------------------- PopTorch supports configuring weight tensors such that a different value of the weight tensor is sent to each replica, or to groups of replicas. This functionality can be used, for instance, to split a weight tensor and process parts of it on different groups of replicas. This functionality is accessed using the :py:func:`~replicaGrouping` method on the weight tensor in question. .. literalinclude:: replica_grouped_weights.py :language: python :caption: How to use replica grouped weights :linenos: :start-after: groupedweights_start :end-before: groupedweights_end In the code example above, eight replicas are used. The weight tensor ``W`` is split four ways between orthogonal groups, each containing two replicas. Orthogonal groups are organised perpendicularly to the replica ordering, so that in this example replicas 0 and 4 would form the first group, 1 and 5 the second, and so on. See :py:class:`~poptorch.CommGroupType` for other replica group organisation options (also illustrated in :numref:`figCommGroupTypes`), and :py:class:`~poptorch.VariableRetrievalMode` for options relating to how many replicas will be involved in value retrieval. .. figure:: comm-group-types.png :name: figCommGroupTypes :width: 100% Possible CommGroupTypes Note that in this code example, the input tensor ``X`` is split two ways. This is achieved using :py:meth:`~poptorch.Options.inputReplicaGrouping`. .. _optimizers: Optimizers ========== PopTorch supports the following optimizers: #. :py:class:`~poptorch.optim.SGD` #. :py:class:`~poptorch.optim.Adam` #. :py:class:`~poptorch.optim.AdamW` #. :py:class:`~poptorch.optim.RMSprop` #. :py:class:`~poptorch.optim.LAMB` In addition, PopTorch has features to support ``float16`` models, such as loss scaling, velocity scaling, bias correction and accumulator types. .. 
important:: All of these extra attributes (except ``velocity_scaling``) must have the same values for different ``param_groups`` and therefore you must set them at the optimizer level. .. literalinclude:: api.py :language: python :caption: How to update values in an Optimizer :linenos: :start-after: optim_start :end-before: optim_end :emphasize-lines: 5-9 .. important:: You must call :py:func:`~poptorch.PoplarExecutor.setOptimizer` to apply the new optimizer values to the model. .. warning:: PopTorch does not directly use the Python implementation of the optimizers. Built-in implementations are used in their place. This means that you cannot currently use custom optimizers. Subclassing a built-in optimizer will generate a warning. Any custom behaviour in a custom optimizer is unlikely to take effect, other than simply setting the existing attributes. Loss scaling ------------ When training models which use ``half`` or ``float16`` values, you can use loss scaling to prevent the gradients from becoming too small and causing underflows. Before calculating the gradients, PopTorch will scale the loss by the value of the ``loss_scaling`` parameter. PopTorch will multiply the gradients by the inverse scale prior to updating the optimizer state. Therefore, beyond improving numerical stability, neither the training nor the hyper-parameters are affected. Higher ``loss_scaling`` values can improve numerical stability by minimising underflow. However, too high a value can result in overflow. The optimal loss scaling factor depends on the model. You can either set the ``loss_scaling`` factors manually, or you can set :py:func:`~poptorch.options._TrainingOptions.setAutomaticLossScaling` in :py:class:`opts.Training <poptorch.options._TrainingOptions>`, which will automatically set a global loss scaling factor. If you both set ``loss_scaling`` manually and enable automatic loss scaling, the manually set factor(s) will be used initially and updated automatically during training. .. 
warning:: Automatic loss scaling is a preview feature. It is well tested and enabled in some of our example applications, but may not behave as expected in all models. Recommendation: if your model with automatic loss scaling enabled does not converge or triggers a compilation error, then you will need to set the loss scale manually. Velocity scaling (SGD combined variant only) -------------------------------------------- The SGD optimizer, when used with momentum, updates weights based on the velocity values. The combined variant uses one tensor per parameter to store the velocity and the changes to the velocity from accumulated gradients. Unlike the separate variant, therefore, each gradient accumulation step involves adding or subtracting values of a different magnitude to the gradients (for which loss scaling is used). You can therefore use the ``velocity_scaling`` parameter to scale the combined velocity tensor to improve numerical precision when using ``half``/``float16`` values. (Note that the gradients are, in effect, scaled by ``velocity_scaling/loss_scaling`` so the ``loss_scaling`` has no impact on the effective scaling of velocity parameters.) As with loss scaling, higher values can minimise underflow of the velocity values but may result in overflow. Accumulation types ------------------ In order to improve numerical stability some of the optimizers (LAMB, Adam, AdamW, RMSprop) give you the option to tweak the data type used by the optimizer's accumulators. ``accum_type`` lets you choose the type used for gradient accumulation. ``first_order_momentum_accum_type`` / ``second_order_momentum_accum_type`` give you control over the type used to store the first-order and second-order momentum optimizer states. Constant attributes ------------------- In order to improve performance and / or save memory PopTorch will try to embed directly in the program the attributes which are constant. .. 
important:: Trying to modify a constant attribute after the model has been compiled will result in an error. For PopTorch optimizers (those from the ``poptorch.optim`` namespace) by default the attributes explicitly passed to the optimizer's constructor will be considered variables and the others will be considered as constant. You can override this behaviour using :py:func:`~poptorch.optim.VariableAttributes.markAsConstant` and :py:func:`~poptorch.optim.VariableAttributes.markAsVariable` before compiling the model. .. literalinclude:: api.py :language: python :caption: Constant and variable attributes for PopTorch optimizers :linenos: :start-after: optim_const_start :end-before: optim_const_end For native optimizers (those from the `torch.optim `__ namespace) the attributes which are left to their default value in the constructor will be considered to be constant. There is no method to override this behaviour which is why we recommend you always use the ``poptorch.optim`` optimizers instead. .. literalinclude:: api.py :language: python :caption: Constant and variable attributes for Torch optimizers :linenos: :start-after: torch_optim_const_start :end-before: torch_optim_const_end .. note:: There is an exception: ``lr`` is always marked as variable. Reading and writing optimizer state ----------------------------------- When you use a ``poptorch.optim`` optimizer with a :py:func:`~poptorch.trainingModel`, you can use the optimizer's ``state_dict()`` and ``load_state_dict()`` functions to read/write optimizer state to/from the IPU. This can be used to restart training from a checkpoint saved previously. .. literalinclude:: api.py :caption: Reading and writing optimiser state :start-after: optim_state_dict_start :end-before: optim_state_dict_end :emphasize-lines: 6,11 .. note:: The structure of the state dictionary, as well as the keys within, will differ from those in PyTorch. 
As such, you cannot load a state dictionary with PopTorch that was obtained by running native PyTorch. PopTorch ops ============ This section describes some "helper" operations you can use within a model. poptorch.ctc_beam_search_decoder -------------------------------- This function adds a Connectionist Temporal Classification (CTC) beam search decoder operator to the model. .. literalinclude:: api.py :language: python :linenos: :start-after: ctc_beam_search_start :end-before: ctc_beam_search_end :emphasize-lines: 3 For more information see: :py:func:`~poptorch.ctc_beam_search_decoder`. poptorch.ipu_print_tensor ------------------------- This function adds an op to print the content of a tensor on the IPU. .. note:: To prevent the print operation being optimised out by the graph optimiser, you must use the return value of ``ipu_print_tensor()``. .. literalinclude:: api.py :language: python :linenos: :start-after: print_tensor_start :end-before: print_tensor_end :emphasize-lines: 10 For more information see: :py:func:`~poptorch.ipu_print_tensor`. poptorch.identity_loss ---------------------- You can use this function to implement custom losses. It takes a single PyTorch tensor and will backpropagate a gradient of ones through it. .. literalinclude:: api.py :language: python :linenos: :start-after: identity_start :end-before: identity_end :emphasize-lines: 5 :caption: Example of custom loss. For more information see: :py:func:`~poptorch.identity_loss`. poptorch.MultiConv ------------------ Use the :py:class:`~poptorch.MultiConv` wrapper class to define multi-convolutions. Refer to the `PopLibs documentation for multi-convolutions `__ for further information. For more information see: :py:class:`~poptorch.MultiConv` and :py:class:`~poptorch.MultiConvPlanType`. poptorch.nop ------------ PopTorch includes a "no-op" function for debugging purposes. For more information see: :py:func:`~poptorch.nop`. 
poptorch.dynamic_slice ---------------------- Standard PyTorch slicing syntax cannot currently be used to create dynamic slices. This function supports dynamic slicing on the IPU. For more information see: :py:func:`~poptorch.dynamic_slice`. poptorch.dynamic_update ----------------------- Standard PyTorch slicing syntax cannot currently be used to dynamically update a slice of a tensor. ``poptorch.dynamic_update`` allows updating a tensor with a statically sized slice at a dynamic index. This function supports dynamic updates on the IPU. For more information see: :py:func:`~poptorch.dynamic_update`. poptorch.serializedMatMul ------------------------- Use this function to create a serialized matrix multiplication, which splits a larger matrix multiplication into smaller matrix multiplications to reduce memory requirements. For more information see: :py:func:`~poptorch.serializedMatMul`. poptorch.set_available_memory ----------------------------- Use this function to override the default proportion of tile memory available as temporary memory for use by operations such as a convolution or matrix multiplication. The operators that can be tuned with this setting include: * convolution * matrix multiplication * embedding lookup * indexing operations For more information see: * :py:func:`~poptorch.set_available_memory` * `technical note `_ on optimising temporary memory usage Miscellaneous functions ----------------------- The following PopTorch functions, not related to model creation, are available: - :py:func:`~poptorch.ipuHardwareIsAvailable` - :py:func:`~poptorch.ipuHardwareVersion` - :py:func:`~poptorch.setLogLevel` 16-bit float support ==================== PopTorch supports the half-precision floating point (``float16``) format. You can simply input ``float16`` tensors into your model. (You can convert a tensor to ``float16`` using ``tensor = tensor.half()``) You can use your models in one of the following ways: #. 
Convert all parameters (weights) to ``float16`` by using a ``Module``'s ``.half()`` method. This is the most memory efficient, however small updates to weights may be lost, hindering training. #. Keep the parameters (weights) as ``float32``, in which case the parameter updates will occur using ``float32``. However, the parameters will be converted to ``float16`` if you call an operation with a ``float16`` input. This is more memory efficient than using ``float32`` tensors (inputs) but less memory efficient than using ``float16`` weights. #. Use a mix of ``float32`` and ``float16`` parameters by manually specifying parameters as ``float16`` or ``float32``. .. note:: When PyTorch encounters a mix of ``float16`` and ``float32`` inputs for a given operation, it will usually cast all inputs to ``float32``, and PopTorch complies with this convention. .. literalinclude:: inferenceModel.py :language: python :caption: How to run a model using half precision :linenos: :start-after: inference_half_start :end-before: inference_half_end :emphasize-lines: 1, 2 PyTorch buffers =============== PopTorch supports PyTorch buffers in some circumstances. You can use buffers to make tensors persistent, that is to allow tensors to keep their values from the previous run on each new run, without making them model parameters. However, you must make sure that you only make in-place modifications to the buffer using PyTorch in-place operations (such as ``+=`` or those ending in ``_``). For example, you can use ``torch.Tensor.copy_`` to copy the contents of another tensor to the buffer. Unlike when running on the CPU, the following PyTorch code does not increment ``model.i`` each time, when running on the IPU: .. 
literalinclude:: buffers.py :language: python :caption: The wrong way to have a persistent tensor :linenos: :start-after: counter_model_wrong_start :end-before: counter_model_wrong_end This is because the PyTorch dispatcher will capture the value for ``model.i`` when building the graph and freeze the value as a constant. You can keep the value of a tensor between runs by registering it as a buffer in PyTorch, as the following example shows: .. literalinclude:: buffers.py :language: python :caption: An example showing a tensor which is incremented on each iteration by registering it as a buffer. :linenos: :start-after: counter_model_correct_start :end-before: counter_model_correct_end .. note:: When running an inference model (with :py:func:`~poptorch.inferenceModel`), any buffers which your model modifies will not be implicitly copied to the host. You will need to call :py:func:`~poptorch.PoplarExecutor.copyWeightsToHost` before reading the value of a buffer which has been changed as a result of a model call. .. note:: PopTorch does not support broadcasting of buffers between replicas. You can make each replica use its own buffer by setting the PopTorch option :py:func:`~poptorch.Options.broadcastBuffers` to False: ``poptorch.Options().broadcastBuffers(False)`` You need to ensure that your model still works with each replica using a separate buffer. .. _creating_custom_ops: Creating custom ops =================== If you need to implement functionality that is not directly supported in PopTorch, you can create a custom op. There are two steps to creating a custom op in PopTorch: #. Implement the op in C++ using the PopART API #. Make the op available in PopTorch so you can use it in your PyTorch model Implementing the custom op -------------------------- You will need to implement the new op as C++ code by creating subclasses of, at least, the Op and Opx base classes provided by the PopART API. 
If you are going to use the custom op for training, then you will also need to define the classes that implement the gradient operation. For details of how to do this, see the `Custom operators `__ chapter of the PopART User Guide. You can find some examples of custom ops in the :tutorials-repo:`Graphcore GitHub examples repository `. Compiling the PopART custom op will create a dynamic library file, which you can use with your PyTorch code. Make the op available to PyTorch -------------------------------- After you have compiled the C++ implementation of the custom op, you can load the library file, and call the op from your PyTorch program, using the :py:class:`~poptorch.custom_op` class. First, load the dynamic library as shown in :numref:`loading_library_code`. .. literalinclude:: ../../tests/custom_ops_test.py :language: python :caption: Loading the library for the custom op :linenos: :start-after: loading_library_start :end-before: loading_library_end :name: loading_library_code You can now call your custom op using the PopTorch class :py:class:`~poptorch.custom_op`. Both the forward op and backward op are implemented in the PopART code. However, in this inference model example, only the forward op is called: .. literalinclude:: ../../tests/custom_ops_test.py :language: python :caption: Calling a custom op in a PopTorch inference model :linenos: :emphasize-lines: 4-8 :start-after: inference_start :end-before: inference_end In this example ``[x, x]`` is assigned to ``example_outputs``, where ``x`` is one of the input tensors which is used as a template for the output tensors. The custom op code will need to create the tensors that it returns. You can also call this custom op inside a training model using :py:class:`~poptorch.custom_op` and the backward op will be called automatically. 
The Graphcore examples repository contains a feature example demonstrating how to load and use a custom op in a PopTorch model: :tutorials-repo:`PopTorch example: Custom op `. Passing attributes to the custom op ----------------------------------- You can pass attributes to the custom op using a Python dictionary, as shown in :numref:`inference_with_attribute_code`. .. literalinclude:: ../../tests/custom_ops_test.py :language: python :caption: Passing an attribute to a custom op from PopTorch :linenos: :emphasize-lines: 8 :start-after: inference_with_attribute_start :end-before: inference_with_attribute_end :name: inference_with_attribute_code You can then access these attributes within the C++ custom op code. The above example passes a ``Float`` attribute with the name ``alpha`` to the LeakyRELU implementation. See the `Custom operators `__ chapter of the PopART User Guide for more information. Table :numref:`popart-attribute-types` and the code example in :numref:`many_attribtes_examples_code` show how to pass other attribute types to a custom op. PopTorch supports all attributes supported in PopART except for ``Graph``. .. list-table:: Python types to use to pass attributes to PopART :widths: 35 65 :header-rows: 1 :align: center :name: popart-attribute-types * - PopART attribute type - Python equivalent * - ``Float`` - Python float (converted to ``float32``) * - ``Floats`` - List or tuple of Python float * - ``Int`` - Python int (converted to 64-bit signed int) * - ``Ints`` - List or tuple of Python int * - ``String`` - Python str (converted to ASCII) * - ``Strings`` - List or tuple of Python str * - ``Graph`` - Not supported .. literalinclude:: ../../tests/custom_ops_attributes_test.py :language: python :caption: Passing different attribute types from PopTorch :linenos: :start-after: many_attribtes_examples_start :end-before: many_attribtes_examples_end :name: many_attribtes_examples_code Precompilation and caching ========================== .. 
_caching: Caching ------- By default PopTorch will re-compile the model every time you instantiate a model. However if you often run the same models you might want to enable executable caching to save time. You can do this by either setting the ``POPTORCH_CACHE_DIR`` environment variable or by calling :py:class:`~poptorch.Options.enableExecutableCaching`. .. warning:: The cache directory might grow large quickly because PopTorch doesn't delete old models from the cache and, depending on the number and size of your models and the number of IPUs used, the executables might be quite large. It is your responsibility to delete the unwanted cache files. Precompilation -------------- PopTorch supports precompilation: This means you can compile your model on a machine which doesn't have an IPU and export the executable to a file. You can then reload and execute it on a different machine which does have an IPU. .. important:: The PopTorch versions on both machines must be an exact match. To precompile your model you need to wrap it using either :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel` then call :py:meth:`~poptorch.PoplarExecutor.compileAndExport` on the wrapper. .. literalinclude:: precompilation.py :language: python :caption: How to precompile a model using an offline IPU target. :linenos: :start-after: precomp_start :end-before: precomp_end :emphasize-lines: 22-23,32 .. note:: If you don't know the IPU version on your system you can use :py:func:`~poptorch.ipuHardwareVersion`. The exported file by default will contain your original PyTorch model (including the weights), and enough information to re-create the PopTorch wrapper and reload the executable. .. important:: For your model and weights to be exported, your model must be picklable. See https://docs.python.org/3/library/pickle.html for more information. If your model is not picklable then use ``export_model=False``, as shown in :numref:`export_no_python`. 
Now both the torch model, PopTorch wrapper and executable can be restored on the target machine using :py:func:`~poptorch.load`: .. literalinclude:: precompilation.py :language: python :caption: How to load a precompiled model :linenos: :start-after: load_start :end-before: load_end :emphasize-lines: 1 In some cases you might want to provide some runtime information to select the device: you can do this using the ``edit_opts_fn`` argument of :py:func:`~poptorch.load`: .. literalinclude:: precompilation.py :language: python :caption: How to load a precompiled model and run on a specific IPU :linenos: :start-after: load_setIpu_start :end-before: load_setIpu_end :emphasize-lines: 1-2,5 .. note:: When loading a precompiled model, only run-time options will be applied; all others will be ignored. Going back to the precompilation step: in some cases you might want to export only the executable and not the python wrapper or torch model (for example if your model cannot be pickled). .. literalinclude:: precompilation.py :language: python :caption: How to export only the executable :linenos: :start-after: precomp_no_python_start :end-before: precomp_no_python_end :name: export_no_python It means you will need to re-create and wrap the model yourself before loading the executable: .. literalinclude:: precompilation.py :language: python :caption: How to load a precompiled executable :linenos: :start-after: load_exe_start :end-before: load_exe_end :emphasize-lines: 1,6-7 .. important:: Exported models lose their connections to other models. For example, if you have a :py:func:`~poptorch.trainingModel` and a :py:func:`~poptorch.inferenceModel` based on the same PyTorch model, you wouldn't usually need to keep the weights synchronised between the two; PopTorch would take care of it for you, implicitly. In the following example, PopTorch automatically copies the weights from the training model to the inference model: .. 
literalinclude:: precompilation.py :language: python :caption: PopTorch implicit copies :linenos: :start-after: implicit_cp_start :end-before: implicit_cp_end :emphasize-lines: 16,18-20 If you were to export these models: .. literalinclude:: precompilation.py :language: python :caption: Precompilation of both a training and validation models :linenos: :start-after: precomp_train_val_start :end-before: precomp_train_val_end :emphasize-lines: 11-12,14 .. note:: Don't forget to call ``model.eval()`` or ``model.train()``, as required, before calling :py:func:`~poptorch.PoplarExecutor.compileAndExport`. You could then insert explicit copy operations: .. literalinclude:: precompilation.py :language: python :caption: Precompilation of both a training and validation models :linenos: :start-after: load_train_val_start :end-before: load_train_val_end :emphasize-lines: 9,10 Or you would need to re-connect the two models by creating the second one from the first one and then loading the executable: .. literalinclude:: precompilation.py :language: python :caption: Precompilation of both a training and validation models :linenos: :start-after: load_train_val_connected_start :end-before: load_train_val_connected_end :emphasize-lines: 2-6 Environment variables ===================== Logging level ------------- PopTorch uses the following levels of logging: * ``OFF``: No logging * ``ERR``: Errors only * ``WARN``: Warnings and errors only * ``INFO``: Info, warnings and errors (default) * ``DEBUG``: Adds some extra debugging information * ``TRACE`` and ``TRACE_ALL``: Trace everything inside PopTorch You can use the ``POPTORCH_LOG_LEVEL`` environment variable to set the logging level: .. code-block:: bash export POPTORCH_LOG_LEVEL=DEBUG .. _profiling_env: Profiling --------- When running programs using PopTorch, you can enable profiling by using the ``POPLAR_ENGINE_OPTIONS`` environment variable used by Poplar. 
In order to capture the reports needed for the PopVision Graph Analyser you only need to set ``POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true"}'``: .. code-block:: bash export POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true"}' By default, report files are output to the current working directory. You can specify a different output directory by setting ``autoReport.directory``, for example: .. code-block:: bash export POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true", "autoReport.directory":"./tommyFlowers"}' For more options, refer to the `PopVision Graph Analyser User Guide `__. In order to capture the ``pvti`` reports needed for the `PopVision System Analyser `__ you need to enable the `PopVision Trace Instrumentation library (PVTI) `__. To do so, set ``PVTI_OPTIONS='{"enable":"true"}'``. .. important:: By default, PopVision will display multiple trace files using relative time. This is because most of the time we want to compare two executions of the same model, for example. However, in this case we want the traces to be aligned on absolute time: this can be done by selecting "Absolute Timing" in the PopVision options. You can also add extra tracepoints in your own code by using :py:class:`~poptorch.profiling.Channel`. IPU Model --------- By default PopTorch will try to attach to a physical IPU. If instead you want to use the model, you can do so by setting ``POPTORCH_IPU_MODEL`` to 1: .. code-block:: bash export POPTORCH_IPU_MODEL=1 See the `Poplar and PopLibs User Guide `__ for the limitations of the IPU Model. Wait for an IPU to become available ----------------------------------- By default, attempting to attach to an IPU when all IPUs are already in use will raise an exception. If you would rather wait for an IPU to become available, you can do so by setting ``POPTORCH_WAIT_FOR_IPU`` to 1. .. 
code-block:: bash export POPTORCH_WAIT_FOR_IPU=1 Enable executable caching ------------------------- You can enable executable caching by either setting the ``POPTORCH_CACHE_DIR`` environment variable or by calling :py:class:`~poptorch.Options.enableExecutableCaching`. .. code-block:: bash export POPTORCH_CACHE_DIR=/tmp/poptorch_cache For more information, see :numref:`caching`. ================================================ FILE: docs/user_guide/phased_execution.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import torch import torch.nn.functional as F import poptorch # pylint: disable=function-redefined, too-many-function-args # annotations_start poptorch.setLogLevel("DEBUG") # Force debug logging N = 3 size = 10 class Model(torch.nn.Module): def __init__(self): super().__init__() self.weights = torch.nn.ParameterList([ torch.nn.Parameter(torch.rand(size, size), requires_grad=True) for n in range(N * 6) ]) def forward(self, in0, target=None): phase = 0 weight = iter(self.weights) with poptorch.Block("phase0_ipu0"): ins = torch.split(in0, size) for n in range(N * 3): out = [] for ipu in range(2): x = ins[ipu] with poptorch.Block(f"phase{phase}_ipu{ipu}"): x = torch.matmul(next(weight), x) out.append(F.relu(x)) ins = out[1], out[0] # We want 2 matmuls in the same phase if n % 3 != 1: phase += 1 with poptorch.Block(f"phase{N*2-1}_ipu1"): res = ins[0] + ins[1] if target is None: return res return res, torch.nn.L1Loss(reduction="mean")(res, target) input = torch.rand(size * 2, 1) target = torch.rand(size, 1) model = Model() opts = poptorch.Options() phases = [] # Alternate between 0-2 and 1-3 for n in range(N): phases.append([ poptorch.Stage(f"phase{2*n}_ipu0").ipu(0), poptorch.Stage(f"phase{2*n}_ipu1").ipu(2) ]) phases.append([ poptorch.Stage(f"phase{2*n+1}_ipu0").ipu(1), poptorch.Stage(f"phase{2*n+1}_ipu1").ipu(3) ]) 
opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases)) poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.compile(input, target) # annotations_end # stage_start class Model(torch.nn.Module): def forward(self, x, y): with poptorch.Block("A"): c = x + x with poptorch.Block("B"): d = y + y with poptorch.Block("C"): e = x * 3 return c, d, e first = poptorch.Phase(poptorch.Stage("A").ipu(0)) # Regrouped in a single stage second = poptorch.Phase(poptorch.Stage("B", "C").ipu(1)) # 2 separate stages second = poptorch.Phase(poptorch.Stage("B").ipu(1), poptorch.Stage("C").ipu(3)) # stage_end opts = poptorch.Options() opts.autoRoundNumIPUs(True) opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(first, second)) m = poptorch.inferenceModel(Model(), opts) m.compile(input, input) m.destroy() class Model(torch.nn.Module): def forward(self, x, y): with poptorch.Block("A"): c = x + x with poptorch.Block("A2"): d = y + y with poptorch.Block("B"): e = c + d with poptorch.Block("B2"): f = y + d with poptorch.Block("C"): g = e + f with poptorch.Block("C2"): h = f + y return g, h opts = poptorch.Options() # serial_start strategy = poptorch.SerialPhasedExecution( poptorch.Phase(poptorch.Stage("A"), poptorch.Stage("A2")), poptorch.Phase(poptorch.Stage("B"), poptorch.Stage("B2")), poptorch.Phase(poptorch.Stage("C"), poptorch.Stage("C2"))) strategy.phase(0).ipus(0, 1) strategy.phase(1).ipus(0, 1) strategy.phase(2).ipus(0, 1) opts.setExecutionStrategy(strategy) # serial_end m = poptorch.inferenceModel(Model(), opts) m.compile(input, input) m.destroy() class Model(torch.nn.Module): def forward(self, x, y): poptorch.Block.useAutoId() with poptorch.Block(): c = x + x with poptorch.Block(): d = y + y with poptorch.Block(): e = c + d with poptorch.Block(): f = y + d with poptorch.Block(): g = e + f with poptorch.Block(): h = f + y return g, h opts = poptorch.Options() # parallel_start strategy = poptorch.ParallelPhasedExecution( 
poptorch.Phase(poptorch.Stage("0"), poptorch.Stage("1")), poptorch.Phase(poptorch.Stage("2"), poptorch.Stage("3")), poptorch.Phase(poptorch.Stage("4"), poptorch.Stage("5"))) strategy.phase(0).ipus(0, 2) strategy.phase(1).ipus(1, 3) strategy.phase(2).ipus(0, 2) opts.setExecutionStrategy(strategy) # parallel_end m = poptorch.inferenceModel(Model(), opts) m.compile(input, input) m.destroy() ================================================ FILE: docs/user_guide/pipeline_simple.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. # annotations_start import transformers import torch import poptorch # A bert model from hugging face. See the packaged BERT example for actual usage. pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2' # For later versions of transformers, we need to wrap the model and set # return_dict to False class WrappedModel(torch.nn.Module): def __init__(self): super().__init__() self.wrapped = transformers.BertForQuestionAnswering.from_pretrained( pretrained_weights) def forward(self, input_ids, attention_mask, token_type_ids): return self.wrapped.forward(input_ids, attention_mask, token_type_ids, return_dict=False) def __getattr__(self, attr): try: return torch.nn.Module.__getattr__(self, attr) except AttributeError: return getattr(self.wrapped, attr) model = WrappedModel() # A handy way of seeing the names of all the layers in the network. print(model) # All layers before "model.bert.encoder.layer[0]" will be on IPU 0 and all layers from # "model.bert.encoder.layer[0]" onwards (inclusive) will be on IPU 1. model.bert.encoder.layer[0] = poptorch.BeginBlock(model.bert.encoder.layer[0], ipu_id=1) # Now all layers before layer are on IPU 1 and this layer onward is on IPU 2 model.bert.encoder.layer[2] = poptorch.BeginBlock(model.bert.encoder.layer[2], ipu_id=2) # Finally all layers from this layer till the end of the network are on IPU 3. 
model.bert.encoder.layer[4] = poptorch.BeginBlock(model.bert.encoder.layer[4], ipu_id=3) # We must batch the data by at least the number of IPUs. Each IPU will still execute # whatever the model batch size is. data_batch_size = 4 # Create a poptorch.Options instance to override default options opts = poptorch.Options() opts.deviceIterations(data_batch_size) # annotations_end # Model is now passed to the wrapper as usual. inference_model = poptorch.inferenceModel(model, opts) tokenizer = transformers.BertTokenizer.from_pretrained( "mrm8488/bert-medium-finetuned-squadv2", return_token_type_ids=True) # Make use of the model contexts = [ """Edinburgh is Scotland's compact, hilly capital.""", """The oldest cat recorded was Cream Puff at 38 years.""", """The largest litter of kittens produced 19 kittens.""", """The first webcam was used to check the status of a coffee pot.""" ] questions = [ "What is the capital of Scotland?", "How old was the oldest cat ever?", "How many kittens in the largest litter?", "What was the first webcam used for?" 
] encoding = tokenizer(questions, contexts, padding=True) input_ids = encoding["input_ids"] start_scores, end_scores = inference_model( torch.tensor(encoding["input_ids"]), torch.tensor(encoding["attention_mask"]), torch.tensor(encoding["token_type_ids"])) answer_string = [] for batch_id in range(len(contexts)): ans_tokens = input_ids[batch_id][torch.argmax(start_scores[batch_id]):torch .argmax(end_scores[batch_id]) + 1] answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens) answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens) answer_string.append(answer_tokens_to_string) print(answer_string) assert answer_string[0] == 'edinburgh' assert answer_string[1] == '38 years' assert answer_string[2] == '19' assert answer_string[3] == 'to check the status of a coffee pot' # annotations_inline_start class Network(torch.nn.Module): def __init__(self): super().__init__() self.layer1 = torch.nn.Linear(5, 10) self.layer2 = torch.nn.Linear(10, 5) self.layer3 = torch.nn.Linear(5, 5) self.layer4 = torch.nn.Linear(5, 5) self.act = torch.nn.ReLU() self.softmax = torch.nn.Softmax(dim=1) def forward(self, x): # Explicit layers on a certain IPU poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = self.act(self.layer1(x)) with poptorch.Block(ipu_id=1): x = self.act(self.layer2(x)) with poptorch.Block(ipu_id=2): x = self.act(self.layer3(x)) x = self.act(self.layer4(x)) with poptorch.Block(ipu_id=3): x = self.softmax(x) return x model = Network() opts = poptorch.Options() opts.deviceIterations(4) poptorch_model = poptorch.inferenceModel(model, options=opts) print(poptorch_model(torch.rand((4, 5)))) # annotations_inline_end # pylint: disable=function-redefined # annotations_decorator_start class Network(torch.nn.Module): def __init__(self): super().__init__() self.layer1 = torch.nn.Linear(5, 10) self.layer2 = torch.nn.Linear(10, 5) self.layer3 = torch.nn.Linear(5, 5) self.layer4 = torch.nn.Linear(5, 5) self.act = torch.nn.ReLU() self.softmax = 
torch.nn.Softmax(dim=1) def forward(self, x): poptorch.Block.useAutoId() x = self.block_one(x) x = self.block_two(x) x = self.final_activation(x) return x @poptorch.BlockFunction(ipu_id=0) def block_one(self, x): x = self.act(self.layer1(x)) x = self.act(self.layer2(x)) return x @poptorch.BlockFunction(ipu_id=1) def block_two(self, x): x = self.act(self.layer3(x)) x = self.act(self.layer4(x)) return x @poptorch.BlockFunction(ipu_id=1) def final_activation(self, x): return self.softmax(x) model = Network() opts = poptorch.Options() opts.deviceIterations(4) poptorch_model = poptorch.inferenceModel(model, options=opts) print(poptorch_model(torch.rand((4, 5)))) # annotations_decorator_end ================================================ FILE: docs/user_guide/poptorch.conf ================================================ deviceIterations(1) setExecutionStrategy(poptorch.ShardedExecution()) replicationFactor(1) enableSyntheticData(True) ================================================ FILE: docs/user_guide/poptorch_training_simple.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
class ExampleModel(torch.nn.Module):
    """Toy two-class scorer with a single learnable scalar ``bias``.

    The two scores are ``100 * LeakyReLU(bias - x)`` and
    ``100 * LeakyReLU(x - bias)``, so the larger score tells on which side
    of ``bias`` the input lies.
    """

    def __init__(self):
        super().__init__()
        # 0-dim (scalar) parameter, initialised to zero.
        self.bias = torch.nn.Parameter(torch.zeros(()))

    def forward(self, x):
        """Return both scores concatenated along the last dimension."""
        return torch.cat([
            100 * torch.nn.LeakyReLU()(-x + self.bias),
            100 * torch.nn.LeakyReLU()(x - self.bias)
        ],
                         dim=-1)


# model_with_loss_start
class ExampleModelWithLoss(torch.nn.Module):
    """Wrap ``ExampleModel`` so that ``forward`` also returns the loss.

    ``forward`` returns a ``(probabilities, loss)`` tuple: the softmax of the
    wrapped model's output and the mean cross-entropy between that (raw)
    output and ``target``.
    """

    def __init__(self):
        super().__init__()
        self.model = ExampleModel()

    def forward(self, input, target):
        out = self.model(input)
        # The class scores are concatenated on the last axis, so normalise
        # over it explicitly: omitting ``dim`` is deprecated and makes the
        # axis depend on the rank of ``out``.
        return (torch.nn.functional.softmax(out, dim=-1),
                torch.nn.CrossEntropyLoss(reduction="mean")(out, target))


# model_with_loss_end


class ExampleDataset(torch.utils.data.Dataset):
    """Pre-generated dataset of ``(data, label)`` pairs with 0/1 labels.

    Each sample is ``(torch.rand(shape) + label) * 0.5``, so label 0 yields
    values in the lower half of [0, 1] and label 1 values in the upper half.

    Note: construction reseeds the global torch RNG with
    ``torch.manual_seed(0)`` so the generated data is reproducible.

    :param shape: Shape of each data tensor.
    :param length: Number of samples to generate.
    """

    def __init__(self, shape, length):
        super().__init__()
        self._shape = shape
        self._length = length

        self._all_data = []
        self._all_labels = []

        torch.manual_seed(0)
        for _ in range(length):
            # Draw the label first, then the data: keeping this order keeps
            # the generated samples identical from run to run.
            label = 1 if torch.rand(()) > 0.5 else 0
            data = (torch.rand(self._shape) + label) * 0.5
            self._all_data.append(data)
            self._all_labels.append(label)

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._all_data[index], self._all_labels[index]
_, loss = poptorch_model(batch, target) if momentum_loss is None: momentum_loss = loss else: momentum_loss = momentum_loss * 0.95 + loss * 0.05 # Optimizer can be updated via setOptimizer. if momentum_loss < 0.1: poptorch_model.setOptimizer( torch.optim.AdamW(model.parameters(), lr=0.0001)) # simple_ipu_end print(model.model.bias) assert (model.model.bias > 0.4 and model.model.bias < 0.6) # simple_cpu_start training_data = torch.utils.data.DataLoader(ExampleDataset(shape=[1], length=20000), batch_size=10, shuffle=True, drop_last=True) model = ExampleModelWithLoss() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=0.001) momentum_loss = None for batch, target in training_data: # Zero gradients optimizer.zero_grad() # Run model. _, loss = model(batch, target) # Back propagate the gradients. loss.backward() # Update the weights. optimizer.step() if momentum_loss is None: momentum_loss = loss else: momentum_loss = momentum_loss * 0.95 + loss * 0.05 if momentum_loss < 0.1: optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001) # simple_cpu_end print(model.model.bias) assert (model.model.bias > 0.4 and model.model.bias < 0.6) if __name__ == "__main__": run_examples() ================================================ FILE: docs/user_guide/precompilation.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import sys import poptorch if not poptorch.ipuHardwareIsAvailable(): sys.exit(0) ipu_target_version = poptorch.ipuHardwareVersion() filename = "training.poptorch" # pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order # precomp_start import torch import poptorch class ExampleModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.fc = torch.nn.Linear(10, 10) self.loss = torch.nn.MSELoss() def forward(self, x, target=None): fc = self.fc(x) if self.training: return fc, self.loss(fc, target) return fc torch.manual_seed(0) model = ExampleModelWithLoss() opts = poptorch.Options() # You don't need a real IPU to compile the executable. opts.useOfflineIpuTarget(ipu_target_version) # Wrap the model in our PopTorch annotation wrapper. poptorch_model = poptorch.trainingModel(model, opts) # Some dummy inputs. input = torch.randn(10) target = torch.randn(10) poptorch_model.compileAndExport(filename, input, target) # precomp_end poptorch_model.destroy() # load_start poptorch_model = poptorch.load(filename) # That's all: your model is ready to be used. poptorch_model(input, target) # Run on IPU # load_end poptorch_model.destroy() # load_setIpu_start def setIpuDevice(opts): opts.useIpuId(1) # always use IPU 1 poptorch_model = poptorch.load(filename, edit_opts_fn=setIpuDevice) poptorch_model(input, target) # Run on IPU 1 # load_setIpu_end # precomp_no_python_start poptorch_model.compileAndExport(filename, input, target, export_model=False) # precomp_no_python_end poptorch_model.destroy() # load_exe_start model = ExampleModelWithLoss() opts = poptorch.Options() # Wrap the model in our PopTorch annotation wrapper. poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.loadExecutable(filename) # Some dummy inputs. 
input = torch.randn(10) target = torch.randn(10) poptorch_model(input, target) # Run on IPU # load_exe_end poptorch_model.destroy() # precomp_train_val_start model = ExampleModelWithLoss() opts = poptorch.Options() # Some dummy inputs. input = torch.randn(10) target = torch.randn(10) # Wrap the model in our PopTorch annotation wrapper. training_model = poptorch.trainingModel(model, opts) training_model.compileAndExport("training.poptorch", input, target) model.eval() validation_model = poptorch.inferenceModel(model, opts) validation_model.compileAndExport("validation.poptorch", input) # precomp_train_val_end epochs = range(2) def run_training(_): pass def run_validation(_): pass # implicit_cp_start model = ExampleModelWithLoss() opts = poptorch.Options() # Wrap the model in our PopTorch annotation wrapper. training_model = poptorch.trainingModel(model, opts) model.eval() validation_model = poptorch.inferenceModel(model, opts) # Some dummy inputs. input = torch.randn(10) target = torch.randn(10) # Train the model: for epoch in epochs: training_model(input, target) # Weights are implicitly copied from the training model # to the validation model prediction = validation_model(input) # implicit_cp_end training_model.destroy() validation_model.destroy() # load_train_val_start training_model = poptorch.load("training.poptorch") validation_model = poptorch.load("validation.poptorch") for epoch in epochs: print("Epoch ", epoch) run_training(training_model) # Need to explicitly copy weights between the two models # because they're not connected anymore. 
training_model.copyWeightsToHost() validation_model.copyWeightsToDevice() run_validation(validation_model) # load_train_val_end training_model.destroy() validation_model.destroy() # load_train_val_connected_start training_model = poptorch.load("training.poptorch") # Create a validation python model based on the training model validation_model = poptorch.inferenceModel(training_model) validation_model.model.eval() # Load the executable for that model: validation_model.loadExecutable("validation.poptorch") for epoch in epochs: print("Epoch ", epoch) run_training(training_model) # Nothing to do: training_model and validation_model are now connected # and PopTorch will implicitly keep the weights in sync between them. run_validation(validation_model) # load_train_val_connected_end training_model.destroy() validation_model.destroy() ================================================ FILE: docs/user_guide/pytorch_to_poptorch.rst ================================================ ======================== From PyTorch to PopTorch ======================== This page will introduce the key features that enable training on the IPU, and how they differ from native PyTorch. .. note:: PopTorch compiles a ``torch.nn.Module`` model for the IPU when it is wrapped in either a :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel`, as appropriate. This provides similar functionality to ``torch.compile`` but with more flexibility to generate optimal code for the IPU. Also, ``torch.compile`` does not pass options to a custom compiler backend. For these reasons, we do not currently support ``torch.compile``. Any calls to ``torch.compile`` should be replaced by wrapping the model with either :py:func:`~poptorch.trainingModel` or :py:func:`~poptorch.inferenceModel`. These functions perform static `compilation of the whole graph `__ to produce optimized code to run on the IPU. The compilation of multiple partial graphs is not supported.
Preparing your data =================== Data loading in PyTorch is typically handled using `torch.utils.data.DataLoader `_. PopTorch extends PyTorch's DataLoader with a :py:class:`~poptorch.DataLoader` to enable efficient data batching with respect to PopTorch's underlying machine learning framework, PopART. Instantiation is almost identical to PyTorch, but you must remember to pass an instance of :py:class:`~poptorch.Options`. +-------------------------------------------------+ | PyTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 1-5 | | :start-after: simple_cpu_start | | :end-before: simple_cpu_end | | :dedent: 4 | +-------------------------------------------------+ | PopTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 1-9 | | :start-after: simple_ipu_start | | :end-before: simple_ipu_end | | :dedent: 4 | +-------------------------------------------------+ For more information about how to set :py:class:`~poptorch.Options`, see :numref:`efficient_data_batching`. Creating your model =================== Training -------- If you want to create a model for training on the IPU, you first need to wrap your model in a PyTorch model that returns a tuple containing two elements: the outputs of the model and the loss. +-------------------------------------------------+ | PopTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :start-after: model_with_loss_start | | :end-before: model_with_loss_end | +-------------------------------------------------+ Then all you need to do is instantiate a :py:func:`~poptorch.trainingModel`, by passing your new PyTorch model, :py:class:`~poptorch.Options`, and optimizer. +-------------------------------------------------+ | PyTorch | +-------------------------------------------------+ | .. 
literalinclude:: poptorch_training_simple.py | | :lines: 7-10 | | :start-after: simple_cpu_start | | :end-before: simple_cpu_end | | :dedent: 4 | +-------------------------------------------------+ | PopTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 11-19 | | :start-after: simple_ipu_start | | :end-before: simple_ipu_end | | :dedent: 4 | | :emphasize-lines: 7-9 | +-------------------------------------------------+ Inference --------- For inference, it's even easier. Just instantiate an :py:func:`~poptorch.inferenceModel` by passing your PyTorch model. .. code-block:: python poptorch_model = poptorch.inferenceModel(model) The training loop ================= A simple training loop in PyTorch will typically consist of: - Setting gradients to zero - Performing a forwards pass with the model (and obtaining the loss) - Performing the backwards pass with respect to the loss, and updating weights - Updating the optimizer In PopTorch, these steps are combined into a single step +-------------------------------------------------+ | PyTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 14-25 | | :start-after: simple_cpu_start | | :end-before: simple_cpu_end | | :dedent: 4 | +-------------------------------------------------+ | PopTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 23-26 | | :start-after: simple_ipu_start | | :end-before: simple_ipu_end | | :dedent: 4 | +-------------------------------------------------+ Multiple/custom losses ====================== If using multiple losses, or when creating a custom loss, the final loss must be marked explicitly using :py:func:`~poptorch.identity_loss`. +----------------------------------------------------------------------+ | PyTorch | +----------------------------------------------------------------------+ | .. 
code-block:: python | | | | def custom_loss(output, target): | | loss1 = torch.nn.functional.nll_loss(output, target) | | loss2 = torch.nn.functional.nll_loss(output, target) * 5.0 | | return loss1 + loss2 | +----------------------------------------------------------------------+ | PopTorch | +----------------------------------------------------------------------+ | .. code-block:: python | | :emphasize-lines: 4 | | | | def custom_loss(output, target): | | loss1 = torch.nn.functional.nll_loss(output, target) | | loss2 = torch.nn.functional.nll_loss(output, target) * 5.0 | | return poptorch.identity_loss(loss1 + loss2, reduction='none') | +----------------------------------------------------------------------+ Optimizers ========== One important thing to note about using optimizers in PopTorch is that the optimizer state is encapsulated within the PopTorch model. As such, any change made to the optimizer outside of the model must be followed by a call to :py:meth:`poptorch_model.setOptimizer <poptorch.PoplarExecutor.setOptimizer>`, passing in the updated optimizer. .. warning:: PopTorch does not directly use the Python implementation of the optimizers. Built-in implementations are used in their place. This means that you cannot currently use custom optimizers. Subclassing a built-in optimizer will generate a warning. Any custom behaviour in a custom optimizer is unlikely to take effect, other than simply setting the existing attributes. +-------------------------------------------------+ | PyTorch | +-------------------------------------------------+ | .. literalinclude:: poptorch_training_simple.py | | :lines: 27-33 | | :start-after: simple_cpu_start | | :end-before: simple_cpu_end | | :dedent: 4 | | :emphasize-lines: 7 | +-------------------------------------------------+ | PopTorch | +-------------------------------------------------+ | ..
literalinclude:: poptorch_training_simple.py | | :lines: 28-36 | | :start-after: simple_ipu_start | | :end-before: simple_ipu_end | | :dedent: 4 | | :emphasize-lines: 8-9 | +-------------------------------------------------+ .. note:: PopTorch also provides its own set of optimizers that can be accessed via ``poptorch.optim``. These are wrapper classes which have several advantages over the native PyTorch optimizers. They embed constant attributes for performance/memory savings and allow you to specify additional parameters such as loss scaling and velocity scaling. See :numref:`optimizers` for more information. Going further ============= For a more detailed example of getting started with PopTorch, see the :tutorials-repo:`PyTorch basics tutorial ` which walks through training an MNIST model on the IPU. ================================================ FILE: docs/user_guide/reference.rst ================================================ .. _reference: ============= API reference ============= .. _api_options: Options ======= .. autoclass:: poptorch.Options :members: .. autoclass:: poptorch.options._DistributedOptions :members: .. autoclass:: poptorch.options._PrecisionOptions :members: .. autoclass:: poptorch.options._JitOptions :members: .. autoclass:: poptorch.options._TensorLocationOptions :members: .. autoclass:: poptorch.TensorLocationSettings :members: .. autoclass:: poptorch.options._TrainingOptions :members: Helpers ======= .. autofunction:: poptorch.ipuHardwareIsAvailable .. autofunction:: poptorch.ipuHardwareVersion .. autofunction:: poptorch.setLogLevel .. autoclass:: poptorch.profiling.Channel :members: PopTorch ops ============ .. autofunction:: poptorch.ctc_beam_search_decoder .. autofunction:: poptorch.ipu_print_tensor .. autofunction:: poptorch.for_loop .. autofunction:: poptorch.recomputationCheckpoint .. autofunction:: poptorch.identity_loss .. autoclass:: poptorch.MultiConv :members: .. autoclass:: poptorch.CPU :special-members: __init__ .. 
autoclass:: poptorch.NameScope :members: .. autoclass:: poptorch.MultiConvPlanType .. autoclass:: poptorch.custom_op .. autofunction:: poptorch.nop .. autofunction:: poptorch.dynamic_slice .. autofunction:: poptorch.dynamic_update .. autofunction:: poptorch.serializedMatMul .. autofunction:: poptorch.set_available_memory .. autofunction:: poptorch.set_overlap_for_input .. autofunction:: poptorch.set_overlap_for_output .. autofunction:: poptorch.nearest .. autofunction:: poptorch.fps .. autofunction:: poptorch.cond Model wrapping functions ======================== .. autofunction:: poptorch.trainingModel .. autofunction:: poptorch.inferenceModel .. autoclass:: poptorch.PoplarExecutor :special-members: __call__ :members: .. autofunction:: poptorch.isRunningOnIpu .. autofunction:: poptorch.load Parallel execution ================== .. autoclass:: poptorch.Block :special-members: __init__, useAutoId .. autoclass:: poptorch.BeginBlock :special-members: __init__ .. autofunction:: poptorch.BlockFunction .. autofunction:: poptorch.removeBlocks .. autoclass:: poptorch.Stage :special-members: __init__ .. autoclass:: poptorch.AutoStage .. autoclass:: poptorch.Phase :special-members: __init__ .. autoclass:: poptorch.ShardedExecution :inherited-members: .. autoclass:: poptorch.PipelinedExecution :special-members: __init__ :inherited-members: .. autoclass:: poptorch.SerialPhasedExecution :special-members: __init__ :inherited-members: .. autoclass:: poptorch.ParallelPhasedExecution :special-members: __init__ :inherited-members: .. autoclass:: poptorch.Liveness .. autoclass:: poptorch.CommGroupType .. autoclass:: poptorch.VariableRetrievalMode .. py:function:: replicaGrouping Call this function on a weight tensor (after applying a PopTorch wrapper with :py:func:`~poptorch.inferenceModel` or :py:func:`~poptorch.trainingModel`) to configure replica groups which each receive a different value of the weight tensor. For details and a code example see :numref:`grouping_tensor_weights`. 
:param comm_group_type: The replica group arrangement to use for this tensor. :type comm_group_type: poptorch.CommGroupType :param shards: The number of replicas in each replica group. :type shards: int :param variable_retrieval_mode: The method to use when retrieving the value of this tensor from the replicas. :type variable_retrieval_mode: poptorch.VariableRetrievalMode Optimizers ========== .. autoclass:: poptorch.optim.VariableAttributes :members: .. autoclass:: poptorch.optim.SGD :special-members: __init__ :members: .. autoclass:: poptorch.optim.Adam :special-members: __init__ :members: .. autoclass:: poptorch.optim.AdamW :special-members: __init__ :members: .. autoclass:: poptorch.optim.RMSprop :special-members: __init__ :members: .. autoclass:: poptorch.optim.LAMB :special-members: __init__ :members: Data batching ============= .. autoclass:: poptorch.DataLoader :special-members: __init__ :members: terminate .. autoclass:: poptorch.AsynchronousDataAccessor :special-members: __init__, __len__ :members: terminate .. autoclass:: poptorch.DataLoaderMode :members: Enumerations ============ .. autoclass:: poptorch.SharingStrategy :members: .. autoclass:: poptorch.OverlapMode :members: .. autoclass:: poptorch.MatMulSerializationMode :members: .. autoclass:: poptorch.SyncPattern :members: .. autoclass:: poptorch.ReductionType :members: .. autoclass:: poptorch.ConnectionType :members: .. autoclass:: poptorch.OutputMode :members: .. autoclass:: poptorch.MeanReductionStrategy :members: ================================================ FILE: docs/user_guide/replica_grouped_weights.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import numpy import torch import poptorch # groupedweights_start class ModelWithLoss(torch.nn.Module): def __init__(self, W_init): super().__init__() self.W = torch.nn.Parameter(W_init) def forward(self, X): Z = X @ self.W return Z, poptorch.identity_loss(Z**2, reduction="mean") # Split the weight tensor into 4, and the input data tensor into 2. tensor_shards = 4 data_shards = 2 # Set up the problem random = numpy.random.RandomState(seed=100) prob_X = random.normal(size=(24, 40)).astype(numpy.float32) prob_W_init = random.normal(size=(40, 56)).astype( numpy.float32) * (5 * 8)**-0.5 prob_steps = 4 X = torch.tensor(prob_X) # Run on 8 IPUs W_init = torch.tensor( prob_W_init.reshape(prob_W_init.shape[0], tensor_shards, prob_W_init.shape[1] // tensor_shards).transpose( 1, 0, 2)).contiguous() m = ModelWithLoss(W_init) optim = torch.optim.SGD(m.parameters(), lr=0.01) pt_opts = poptorch.Options() pt_opts.replicationFactor(data_shards * tensor_shards) pt_opts.inputReplicaGrouping(tensor_shards, poptorch.enums.CommGroupType.Consecutive) pt_opts.outputMode(poptorch.OutputMode.All) pt_m = poptorch.trainingModel(m, optimizer=optim, options=pt_opts) pt_m.W.replicaGrouping(poptorch.enums.CommGroupType.Orthogonal, data_shards, poptorch.enums.VariableRetrievalMode.OnePerGroup) pt_losses = [] if data_shards > 1: X = X.reshape(data_shards, X.shape[0] // data_shards, *X.shape[1:]) for _ in range(prob_steps): _, loss = pt_m(X) # We divide by the number of replicas because the mean is being # taken only over a part of the tensor on each replica, so we need to # divide by the number of replicas to get the correct mean. 
pt_losses.append(torch.sum(loss.detach()) / (data_shards * tensor_shards)) pt_losses = numpy.array(pt_losses) pt_W_final = m.W.detach().numpy().transpose(1, 0, 2) \ .reshape(prob_W_init.shape) # groupedweights_end ================================================ FILE: docs/user_guide/sumAnchorReturnType.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import random import torch import poptorch RAND_SEED = 8549 class ExampleClassDataset(torch.utils.data.Dataset): """ A dummy dataset with classes for emulating a classification task. All instances of a class, C, will correspond to R*V where R is a randomly generated rotation matrix, fixed for the whole dataset V = V_all + V_cls V_all is a vector of vec_length for which all elements are sampled from an i.i.d. normal distribution, V_all ~ N(0, 0.2). V_cls is a vector of vec_length such that V_cls[x] ~ N(1, 0.2), if x = C, (i.e. the class label) = 0, otherwise """ def __init__(self, num_classes, vec_length, num_examples): super().__init__() assert vec_length >= num_classes random.seed(RAND_SEED) #Generate the class label at this point self.targets = [None] * num_examples for idx in range(num_examples): self.targets[idx] = random.randrange(num_classes) # To get R, make a random symmetric matrix and use eigenvalue # decomposition torch.manual_seed(RAND_SEED) R = torch.rand([vec_length, vec_length]) R = R + R.transpose(0, 1) _, eigenvectors = torch.linalg.eig(R) self._R = eigenvectors.to(torch.float) # # For now, use identity for R # self._R = torch.eye(vec_length, vec_length) self._dist = torch.distributions.normal.Normal(0, 0.2) self._dist = self._dist.expand([vec_length]) self._vec_length = vec_length def __getitem__(self, idx): torch.manual_seed(idx + RAND_SEED) v = self._dist.sample() item_cls = self.targets[idx] v[item_cls] += 1.0 v = torch.matmul(self._R, v) return v, item_cls def __len__(self): return len(self.targets) # yapf: disable 
#model_returning_accuracy_start class MulticlassPerceptron(torch.nn.Module): def __init__(self, vec_length, num_classes): super().__init__() self.fc = torch.nn.Linear(vec_length, num_classes) self.loss = torch.nn.CrossEntropyLoss() def forward(self, x, target): fc = self.fc(x) classification = torch.argmax(fc, dim=-1) accuracy = (torch.sum((classification == target).to(torch.float)) / float(classification.numel())) if self.training: return self.loss(fc, target), accuracy return classification, accuracy # model_returning_accuracy_end # yapf: enable NUM_CLASSES = 10 VEC_LENGTH = NUM_CLASSES * 2 # yapf: disable #sum_accuracy_start opts = poptorch.Options() opts.deviceIterations(5) opts.Training.gradientAccumulation(10) opts.outputMode(poptorch.OutputMode.Sum) training_data = poptorch.DataLoader(opts, dataset=ExampleClassDataset( NUM_CLASSES, VEC_LENGTH, 2000), batch_size=5, shuffle=True, drop_last=True) model = MulticlassPerceptron(VEC_LENGTH, NUM_CLASSES) model.train() # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts, optimizer=torch.optim.Adam( model.parameters())) # Run over the training data, 5 batches at a time. for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 5 iteration loop of batchsize 5 with 10 # gradient accumulations (global batchsize = 5 * 10 = 50). "loss" and # "accuracy" will be the sum across all device iterations and gradient # accumulations but not across the model batch size. _, accuracy = poptorch_model(data, labels) # Correct for iterations # Do not divide by batch here, as this is already accounted for in the # PyTorch Model. accuracy /= (opts.device_iterations * opts.Training.gradient_accumulation) print(f"Accuracy: {float(accuracy)*100:.2f}%") #sum_accuracy_end # yapf: enable ================================================ FILE: docs/user_guide/supported_ops.rst ================================================ .. 
_supported_ops: IPU supported operations ************************ Below is a list of currently supported operations that can be executed on IPU hardware. This list will be expanded over time as we add more support. Some overloads and modes of operation for ops are not supported and we've tried to list all the caveats but some may have been missed. Torch operations ================ Tensor operations ----------------- Many of the tensor operations will be executed before even reaching the IPU so we can consider them supported anyway. Some, like ``contiguous()``, make no sense on a distributed memory system like the IPU so are ignored. There are no constraints on the memory format of how operations should be called other than the constraint that all graph inputs should be contiguous. We will also create tensor views. However, the aliasing property of views with respect to in-place operations should not be relied on as we may have slightly different view behaviour. Additionally, some PyTorch operations which are not explicitly listed are implemented by composition of the listed ops, and are therefore also supported. Creation ops '''''''''''' * ``torch.arange`` * ``tensor.fill`` * ``torch.full`` * ``torch.full_like`` * ``torch.Tensor.new_ones`` * ``torch.Tensor.new_zeros`` * ``torch.ones`` * ``torch.ones_like`` * ``torch.zeros`` * ``torch.zeros_like`` Indexing, slicing, joining and mutating ops ''''''''''''''''''''''''''''''''''''''''''' In PyTorch, slicing a tensor is accessing a subset of the tensor by providing the start and end indices, such as ``tensor[1:5]``. With a PopTorch model, you may take a slice of a tensor only if one of the following two conditions is met: * The start and end are constants, or can be resolved to be constants (for example, a function of the shape of a tensor which does not change between runs). * The start and end of the slice are related by a constant, for example ``tensor[x:x+5]``.
Please note that this will produce different results to PyTorch if the end value exceeds the length of the tensor: PyTorch will output a smaller size tensor but PopTorch will allow the slice to wrap round to the start of the relevant dimension. PyTorch functions * ``torch.bincount`` * ``torch.bucketize`` * ``torch.cat`` * ``torch.chunk`` * ``torch.gather`` * ``torch.index_select`` * ``torch.index_reduce`` * ``torch.reshape`` * ``torch.roll`` * ``torch.scatter`` * ``torch.scatter_add`` * ``torch.scatter_reduce`` * ``torch.stack`` * ``torch.split`` * ``torch.squeeze`` * ``torch.t`` * ``torch.take_along_dim`` * ``torch.transpose`` * ``torch.unbind`` * ``torch.unsqueeze`` * ``torch.where`` Tensor methods * ``tensor.expand`` * ``tensor.expand_as`` * ``tensor.masked_fill`` * ``tensor.index_fill_`` Random samplers ''''''''''''''' To set the random state, use ``poptorch.Options.randomSeed`` * ``torch.bernoulli`` * ``torch.distributions.Bernoulli`` * ``torch.randn`` * ``torch.normal`` * ``torch.distributions.Normal`` * ``torch.rand`` * ``torch.uniform`` * ``torch.distributions.Uniform`` * ``torch.exponential`` * ``torch.distributions.Exponential`` Math operations --------------- Pointwise ops ''''''''''''' * ``torch.abs`` * ``torch.acos`` * ``torch.acosh`` * ``torch.add`` * ``torch.addcdiv`` * ``torch.amax`` * ``torch.amin`` * ``torch.asin`` * ``torch.asinh`` * ``torch.atan`` * ``torch.atanh`` * ``torch.bitwise_and`` * ``torch.bitwise_not`` * ``torch.bitwise_or`` * ``torch.bitwise_xor`` * ``torch.cdist`` * ``torch.ceil`` * ``torch.clamp`` * ``torch.clamp_max`` * ``torch.clamp_min`` * ``torch.cos`` * ``torch.cosh`` * ``torch.div`` * ``torch.exp`` * ``torch.expm1`` * ``torch.floor`` * ``torch.floor_divide`` * ``torch.fmod`` * ``torch.frac`` * ``torch.log`` * ``torch.log10`` * ``torch.log1p`` * ``torch.log2`` * ``torch.logical_and`` * ``torch.logical_or`` * ``torch.mul`` * ``torch.norm`` * ``torch.neg`` * ``torch.pow`` * ``torch.reciprocal`` * ``torch.remainder`` * 
``torch.round`` * ``torch.rsqrt`` * ``torch.sigmoid`` * ``torch.sign`` * ``torch.sin`` * ``torch.sinh`` * ``torch.sqrt`` * ``torch.square`` * ``torch.sub`` * ``torch.tan`` * ``torch.tanh`` * ``torch.true_divide`` * ``torch.trunc`` Reduction ops ''''''''''''' * ``torch.all`` * ``torch.any`` * ``torch.argmax`` * ``torch.argmin`` * ``torch.count_nonzero`` * ``torch.mean`` * ``torch.median`` * ``torch.prod`` * ``torch.logsumexp`` * ``torch.std`` * ``torch.std_mean`` * ``torch.sum`` * ``torch.var`` * ``torch.var_mean`` Comparison ops '''''''''''''' * ``torch.eq`` * ``torch.ge`` * ``torch.gt`` * ``torch.le`` * ``torch.lt`` * ``torch.max`` * ``torch.min`` * ``torch.ne`` * ``torch.isnan`` * ``torch.topk`` * The option ``sorted=False`` is not supported for ``torch.topk``. * ``torch.argsort`` * ``torch.randperm`` * ``torch.sort`` torch.linalg ops '''''''''''''''' * ``torch.linalg.norm`` 2-norm and nuclear norm are unsupported for matrices. * ``torch.linalg.matrix_norm`` 2-norm and nuclear norm are unsupported. * ``torch.linalg.vector_norm`` Other ops ''''''''' * ``torch.cumsum`` * ``torch.cumprod`` * ``torch.cross`` * ``torch.meshgrid`` * ``torch.cartesian_prod`` * ``torch.tensordot`` BLAS and LAPACK Operations '''''''''''''''''''''''''' * ``torch.addmm`` * ``torch.matmul`` * ``torch.bmm`` Torch.nn operations =================== Containers ---------- ``torch.nn.Module`` and ``torch.nn.Sequential`` can be passed into our compiler wrappers and just work. Convolution layers ------------------ Conv transpose operations do not yet support dilations. * ``torch.nn.Conv1d`` * ``torch.nn.Conv2d`` * ``torch.nn.Conv3d`` * ``torch.nn.ConvTranspose1d`` * ``torch.nn.ConvTranspose2d`` * ``torch.nn.ConvTranspose3d`` Pooling layers -------------- Currently the max pool layers do not return the indices so only the variants with ``return_indices=False`` are supported. 
* ``torch.nn.MaxPool1d`` * ``torch.nn.MaxPool2d`` * ``torch.nn.MaxPool3d`` * ``torch.nn.AvgPool1d`` * ``torch.nn.AvgPool2d`` * ``torch.nn.AvgPool3d`` * ``torch.nn.AdaptiveAvgPool1d`` * ``torch.nn.AdaptiveAvgPool2d`` * ``torch.nn.AdaptiveAvgPool3d`` Padding layers -------------- All padding layers are supported. * ``torch.nn.ReflectionPad1d`` * ``torch.nn.ReflectionPad2d`` * ``torch.nn.ReplicationPad1d`` * ``torch.nn.ReplicationPad2d`` * ``torch.nn.ReplicationPad3d`` * ``torch.nn.ZeroPad2d`` * ``torch.nn.ConstantPad1d`` * ``torch.nn.ConstantPad2d`` * ``torch.nn.ConstantPad3d`` Activations ----------- * ``torch.nn.ELU`` * ``torch.nn.CELU`` * ``torch.nn.GELU`` * ``torch.nn.Hardshrink`` * ``torch.nn.LeakyReLU`` * ``torch.nn.LogSoftmax`` * ``torch.nn.Mish`` * ``torch.nn.ReLU`` * ``torch.nn.SELU`` * ``torch.nn.SiLU`` * ``torch.nn.Sigmoid`` * ``torch.nn.Softmax`` * ``torch.nn.Softplus`` * ``torch.nn.Softsign`` * ``torch.nn.Softshrink`` * ``torch.nn.Tanh`` * ``torch.nn.PReLU`` * ``torch.nn.RReLU`` * ``torch.nn.Hardtanh`` * ``torch.nn.functional.glu`` * ``torch.nn.Threshold`` Normalization layers -------------------- Currently only ``affine=True`` is supported as a parameter. That is to say, only the variants with trainable parameters are supported. * ``torch.nn.BatchNorm1d`` * ``torch.nn.BatchNorm2d`` * ``torch.nn.BatchNorm3d`` * ``torch.nn.LayerNorm`` * ``torch.nn.GroupNorm`` * ``torch.nn.InstanceNorm1d`` * ``torch.nn.InstanceNorm2d`` * ``torch.nn.InstanceNorm3d`` * ``torch.nn.utils.weight_norm`` Recurrent layers ---------------- Bidirectional layers, non-zero dropout probabilities, and setting ``num_layers`` to a value greater than 1 are not currently supported for any recurrent layer. In addition, setting ``bias=False`` is currently only supported for ``torch.nn.GRU``. 
``torch.nn.RNN`` * ``torch.nn.GRU`` * ``torch.nn.LSTM`` Linear layers ------------- * ``torch.nn.Identity`` * ``torch.nn.Linear`` * ``torch.nn.Bilinear`` Dropout ------- * ``torch.nn.Dropout`` Sparse layers ------------- Embedding and EmbeddingBag are supported, with the exception of the ``padding_idx`` parameter. * ``torch.nn.Embedding`` * ``torch.nn.EmbeddingBag`` * ``torch.nn.functional.one_hot`` Loss functions -------------- This version supports a limited subset of loss functions. However, we support :py:func:`~poptorch.identity_loss` which gives you the ability to implement any arbitrary loss function. .. seealso:: :py:func:`~poptorch.identity_loss` One caveat for the following loss functions is that, if they are used, they will always be included in the back propagation and will always receive a gradient. This is a slight deviation from normal PyTorch operations, where they have to opt in to the gradient pass. * ``torch.nn.L1Loss`` * ``torch.nn.MSELoss`` * ``torch.nn.CrossEntropyLoss`` * ``torch.nn.NLLLoss`` * ``torch.nn.BCELoss`` * ``torch.nn.KLDivLoss`` * ``torch.nn.PoissonNLLLoss`` * ``torch.nn.HingeEmbeddingLoss`` * ``torch.nn.BCEWithLogitsLoss`` * ``torch.nn.SmoothL1Loss`` * ``torch.nn.SoftMarginLoss`` * ``torch.nn.CosineEmbeddingLoss`` * ``torch.nn.MarginRankingLoss`` * ``torch.nn.TripletMarginLoss`` * ``torch.nn.CTCLoss`` Vision Layers ------------- The nearest and bicubic modes are supported. * ``torch.nn.Upsample`` PyTorch Scatter functions * ``torch_scatter.scatter`` * ``torch_scatter.composite.scatter_log_softmax`` * ``torch_scatter.composite.scatter_softmax`` * ``torch_scatter.composite.scatter_std`` * ``torch_scatter.composite.scatter_logsumexp`` PyTorch Spline Convolution functions * ``torch_spline_conv.spline_basis`` * ``torch_spline_conv.spline_weighting`` .. _float_16_op_support: 16-bit float operations ======================= .. warning:: Handling of ``float16`` operations has been greatly simplified since PopTorch version 3.0.
Please read this section carefully if you are used to the way this worked prior to version 3.0. In PopTorch version 3.0 and later, ``float16`` operations are handled straightforwardly by the dispatcher frontend. Tensors and models can be freely cast to and from ``float16``, and normalization running statistics can also be retyped by simple casting. If you have PopTorch code created with a previous version of PopTorch, see :numref:`float_16_migration`. .. _float_16_migration: 16-bit float migration ====================== Legacy PopTorch code using ``float16`` can be updated for the dispatcher frontend by considering the following points: * Casts were not well supported by the tracing frontend. They are fully supported by the dispatcher frontend. * ``opts.Precision.halfFloatCasting()`` was used to switch between ways of resolving ops with both ``float32`` and ``float16`` inputs (mixed-precision inputs), either by upcasting the inputs to ``float32``, or by downcasting them to ``float16``. This option is not supported under the dispatcher frontend: mixed precision ops are now always upcast to ``float32``, in accordance with normal PyTorch behaviour. To recreate the effect of ``opts.Precision.halfFloatCasting(poptorch.HalfFloatCastingBehavior.FloatDowncastToHalf)``, which was the default behaviour with the tracing frontend, ``float32`` inputs to mixed-precision ops should be explicitly cast to ``float16`` before being passed to the op. * ``opts.Precision.runningStatisticsAlwaysFloat()`` was used to cause the running mean and variance of certain normalization ops to be calculated in ``float32`` precision, even though the normalization module itself had been cast to ``float16``. This option is not supported in the dispatcher frontend, as the same effect can be achieved by simply casting the running statistic tensors back to ``float32`` before running the model. 
Gradient computation control
============================

``torch.no_grad`` is supported as a context manager as well as a decorator
to suppress the computation of gradients locally.

================================================
FILE: docs/user_guide/trainingModel.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

# training_model_start
import torch
import poptorch


class ExampleModelWithLoss(torch.nn.Module):
    """Linear model which computes its own MSE loss when in training mode."""

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target=None):
        """Return ``(output, loss)`` in training mode, ``output`` otherwise."""
        fc = self.fc(x)
        if self.training:
            return fc, self.loss(fc, target)
        return fc


torch.manual_seed(0)
model = ExampleModelWithLoss()

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model)

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)
ones = torch.ones(10)

# Train on IPU.
for i in range(0, 800):
    # Each call here executes the forward pass, loss calculation, and backward
    # pass in one step.
    # Model input and loss function input are provided together.
    poptorch_out, loss = poptorch_model(input, target)
    print(f"{i}: {loss}")

# Copy the trained weights from the IPU back into the host model.
poptorch_model.copyWeightsToHost()

# Execute the trained weights on host.
model.eval()
native_out = model(input)

# Models should be very close to native output although some operations are
# numerically different and floating point differences can accumulate.
torch.testing.assert_close(native_out, poptorch_out, rtol=1e-04, atol=1e-04)
# training_model_end

# Alias used by the second snippet below.
Model = ExampleModelWithLoss


def train(model):
    """Run one call of ``model`` on the module-level dummy input and target."""
    # Dummy single training step on IPU
    model(input, target)


def train_on_cpu(model):
    """Run one SGD step of ``model`` on the module-level dummy inputs."""
    # Dummy single training step on CPU
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    _, loss = model(input, target)
    loss.backward()
    optimizer.step()


def validate(model):
    """Print the output of ``model`` for the module-level ``ones`` input."""
    # Dummy validate step
    print(model(ones))


# explicit_copy_start
model = Model()

model.eval()
poptorch_inf = poptorch.inferenceModel(model)

# Switch for "poptorch.trainingModel": poptorch_inf will remain in "eval" mode
model.train()
poptorch_train = poptorch.trainingModel(model)

# train on IPU
train(poptorch_train)
torch.save(model.state_dict(), "model.save")  # OK

# Already in "eval" mode
validate(poptorch_inf)  # OK

# switch to "eval" mode for CPU
model.eval()
validate(model)  # OK

# train on CPU
model.train()
train_on_cpu(model)

# Explicit copy needed
poptorch_inf.copyWeightsToDevice()
validate(poptorch_inf)
# explicit_copy_end

================================================
FILE: examples/CMakeLists.txt
================================================
function(add_poptorch_py_example name path)
  message(STATUS "Adding python example '${name}'")

  set(extra_labels "")
  if("${name}" STREQUAL "bert_ipu")
    set(extra_labels ";external_data")
  else()
    set(extra_labels ";short")
  endif()

  add_test(NAME "${name}_example"
           COMMAND python3 ${path}/${name}.py
           WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
  set_tests_properties("${name}_example" PROPERTIES LABELS "examples${extra_labels}")
endfunction()

file(GLOB EXAMPLES "${CMAKE_CURRENT_SOURCE_DIR}/*.py")
if(COPY_TESTS)
  install(FILES ${EXAMPLES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
  set(EXAMPLES_PATH "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(EXAMPLES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

foreach(EXAMPLE ${EXAMPLES})
  get_filename_component(NAME ${EXAMPLE} NAME_WE)
  add_poptorch_py_example(${NAME} ${EXAMPLES_PATH})
endforeach()
================================================ FILE: examples/bert_ipu.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import os import transformers import torch import poptorch if not poptorch.ipuHardwareIsAvailable(): os.environ["POPTORCH_IPU_MODEL"] = "1" tokenizer = transformers.BertTokenizer.from_pretrained( 'mrm8488/bert-medium-finetuned-squadv2', return_token_type_ids=True) # For later versions of transformers, we need to wrap the model and set # return_dict to False class WrappedModel(torch.nn.Module): def __init__(self): super().__init__() self.wrapped = transformers.BertForQuestionAnswering.from_pretrained( 'mrm8488/bert-medium-finetuned-squadv2') def forward(self, input_ids, attention_mask): return self.wrapped.forward(input_ids, attention_mask, return_dict=False) def __getattr__(self, attr): try: return torch.nn.Module.__getattr__(self, attr) except AttributeError: return getattr(self.wrapped, attr) model = WrappedModel() context = """Scotland is a country that is part of the United Kingdom. Covering the northern third of the island of Great Britain, mainland Scotland has a 96 mile (154 km) border with England to the southeast and is otherwise surrounded by the Atlantic Ocean to the north and west, the North Sea to the northeast and the Irish Sea to the south. In addition, Scotland includes more than 790 islands; principally within the Northern Isles and the Hebrides archipelagos.""" questions = [ "How many islands are there in Scotland?", "What sea is to the south of Scotland", "Where is England in relation to Scotland?", "How long is the border between England and Scotland?" ] batches = len(questions) # Pipeline the model over two IPUs. You must have at least as many batches (questions) as you have IPUs. model.wrapped.bert.embeddings.position_embeddings = poptorch.BeginBlock( model.wrapped.bert.embeddings.position_embeddings, ipu_id=1) # Mark model for inference. 
opts = poptorch.Options().deviceIterations(batches) inference_model = poptorch.inferenceModel(model, opts) # Batch by the number of iterations so we fill the pipeline. encoding, input_ids, attention_mask = [None] * batches, [[None]] * batches, [ None ] * batches # Encode the query and context. batch_list, atten_list = [], [] # Encode each question for the IPU. for i in range(0, batches): encoding[i] = tokenizer.encode_plus(questions[i], context, max_length=110, pad_to_max_length='right') input_ids[i], attention_mask[i] = encoding[i]["input_ids"], encoding[i][ "attention_mask"] batch_list.append(input_ids[i]) atten_list.append(attention_mask[i]) input_batch = torch.tensor(batch_list) attention_batch = torch.tensor(atten_list) print(input_batch.size()) # Execute on IPU. start_score_pop, end_scores_pop = inference_model(input_batch, attention_batch) print("Context: " + context) index = 0 for start_score, end_score in zip(start_score_pop, end_scores_pop): answer_ids = input_ids[index][torch.argmax(start_score ):torch.argmax(end_score) + 1] answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids, skip_special_tokens=True) answer = tokenizer.convert_tokens_to_string(answer_tokens) print("Question : " + questions[index]) print("Answer : " + answer) index += 1 ================================================ FILE: examples/lstm.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import torch import torch.nn as nn import poptorch class SimpleLSTM(nn.Module): def __init__(self): super().__init__() self.lstm = nn.LSTM(3, 3) def forward(self, input_tensors, hidden): Y, (Y_h, Y_c) = self.lstm(input_tensors, hidden) return Y, (Y_h, Y_c) inputs = [torch.randn(1, 3) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(len(inputs), 1, -1) hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3)) # clean out hidden state inference_lstm = poptorch.inferenceModel(SimpleLSTM()) out, hidden = inference_lstm(inputs, hidden) print(out) print(hidden) ================================================ FILE: examples/mnist.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. # pylint: disable=too-many-statements def example(): # pylint: disable=import-outside-toplevel import sys import poptorch if not poptorch.ipuHardwareIsAvailable(): poptorch.logger.warn("This examples requires IPU hardware to run") sys.exit(0) # pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order, import-outside-toplevel # mnist_start import torch import torch.nn as nn import torchvision import poptorch # Normal pytorch batch size training_batch_size = 20 validation_batch_size = 100 opts = poptorch.Options() # Device "step" opts.deviceIterations(20) # How many IPUs to replicate over. opts.replicationFactor(4) opts.randomSeed(42) # Load MNIST normally. training_data = poptorch.DataLoader( opts, torchvision.datasets.MNIST('mnist_data/', train=True, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), torchvision.transforms.Normalize( (0.1307, ), (0.3081, )) ])), batch_size=training_batch_size, shuffle=True) # Load MNIST normally. 
val_options = poptorch.Options() validation_data = poptorch.DataLoader( val_options, torchvision.datasets.MNIST('mnist_data/', train=True, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), torchvision.transforms.Normalize( (0.1307, ), (0.3081, )) ])), batch_size=validation_batch_size, shuffle=True, drop_last=True) # A helper block to build convolution-pool-relu blocks. class Block(nn.Module): def __init__(self, in_channels, num_filters, kernel_size, pool_size): super(Block, self).__init__() self.conv = nn.Conv2d(in_channels, num_filters, kernel_size=kernel_size) self.pool = nn.MaxPool2d(kernel_size=pool_size) self.relu = nn.ReLU() def forward(self, x): x = self.conv(x) x = self.pool(x) x = self.relu(x) return x # Define the network using the above blocks. class Network(nn.Module): def __init__(self): super().__init__() self.layer1 = Block(1, 10, 5, 2) self.layer2 = Block(10, 20, 5, 2) self.layer3 = nn.Linear(320, 256) self.layer3_act = nn.ReLU() self.layer4 = nn.Linear(256, 10) self.softmax = nn.LogSoftmax(1) self.loss = nn.NLLLoss(reduction="mean") def forward(self, x, target=None): x = self.layer1(x) x = self.layer2(x) x = x.view(-1, 320) x = self.layer3_act(self.layer3(x)) x = self.layer4(x) x = self.softmax(x) if target is not None: loss = self.loss(x, target) return x, loss return x # Create our model. model = Network() # Create model for training which will run on IPU. training_model = poptorch.trainingModel(model, training_data.options) # Same model as above, they will share weights (in 'model') which once training is finished can be copied back. inference_model = poptorch.inferenceModel(model, validation_data.options) def train(): for batch_number, (data, labels) in enumerate(training_data): output, losses = training_model(data, labels) if batch_number % 10 == 0: print(f"PoptorchIPU loss at batch: {batch_number} is {losses}") # Pick the highest probability. 
_, ind = torch.max(output, 1) assert training_model.options.output_mode in ( poptorch.OutputMode.All, poptorch.OutputMode.Final ), "Only 'Final' and 'All' OutputMode supported" # If we're using Final: only keep the last labels, no-op if using All num_labels = ind.shape[0] labels = labels[-num_labels:] eq = torch.eq(ind, labels) elms, counts = torch.unique(eq, sorted=False, return_counts=True) acc = 0.0 if len(elms) == 2: if elms[0]: acc = (counts[0].item() / num_labels) * 100.0 else: acc = (counts[1].item() / num_labels) * 100.0 print( f"Training accuracy: {acc}% from batch of size {num_labels}" ) print("Done training") def test(): correct = 0 total = 0 with torch.no_grad(): for (data, labels) in validation_data: output = inference_model(data) # Argmax the probabilities to get the highest. _, ind = torch.max(output, 1) # Compare it against the ground truth for this batch. eq = torch.eq(ind, labels) # Count the number which are True and the number which are False. elms, counts = torch.unique(eq, sorted=False, return_counts=True) if len(elms) == 2 or elms[0]: if elms[0]: correct += counts[0].item() else: correct += counts[1].item() total += validation_batch_size print("Validation: of " + str(total) + " samples we got: " + str((correct / total) * 100.0) + "% correct") # Train on IPU. train() test() # mnist_end # AsynchronousDataAccessor must run in the main process if __name__ == "__main__": example() ================================================ FILE: examples/simple_adder.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import torch import torch.nn as nn import poptorch # This simple example demonstrates compiling a model to add # two tensors together using the IPU. 
class SimpleAdder(nn.Module): def forward(self, x, y): return x + y model = SimpleAdder() inference_model = poptorch.inferenceModel(model) t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) assert inference_model(t1, t2) == 3.0 print("Success") ================================================ FILE: popart_compiler/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(popart_compiler) find_package(popef REQUIRED) add_library(popart_compiler_types INTERFACE) target_include_directories(popart_compiler_types INTERFACE types/include) add_library(popart_compiler SHARED "source/CodeletsCompilation.cpp" "source/Compiler.cpp" "source/CompilerImpl.cpp" "source/Utils.cpp" "source/SessionOptions.cpp" "source/custom_operations/Embedding.cpp" "source/custom_operations/FastGatherLastDim.cpp" "source/custom_operations/HostOp.cpp" "source/custom_operations/TorchSoftplus.cpp" "source/custom_operations/UpsampleBilinear2d.cpp" ) file(GLOB_RECURSE popart_compiler_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*" "${CMAKE_CURRENT_SOURCE_DIR}/types/include/*.hpp*") set_target_properties(popart_compiler PROPERTIES CXX_STANDARD 14 PUBLIC_HEADER "${popart_compiler_public_headers}") target_link_libraries(popart_compiler PUBLIC popart_compiler_types PRIVATE popef popart-only poptorch_logging poptorch_exception_info poprithms) target_include_directories(popart_compiler PUBLIC $ $ PRIVATE source/include) # Copy custom codelet sources so that we can install and later pre-compile them # on-demand, configure_file keeps track of changes and always copies on new # version. Custom codelets are also copied into the python package during wheel # creation. 
set(CUSTOM_CODELETS "UpsampleBilinear2dCodelets.inc.cpp" "FastGatherLastDimFwdCodelets.inc.cpp" "FastGatherLastDimBwdCodelets.inc.cpp" ) foreach(SRC ${CUSTOM_CODELETS}) configure_file(source/custom_operations/${SRC} ${SRC} COPYONLY) endforeach() install(TARGETS popart_compiler LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/popart_compiler ) foreach(SRC ${CUSTOM_CODELETS}) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${SRC} DESTINATION ${INSTALL_PYDIR}) endforeach() ================================================ FILE: popart_compiler/include/popart_compiler/CodeletsCompilation.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #ifndef POPART_COMPILER_CODELETS_COMPILATION_HPP #define POPART_COMPILER_CODELETS_COMPILATION_HPP #include namespace poptorch { namespace popart_compiler { // Called from python on each 'import poptorch'. Cache path is expected to be // a true filesystem path of the installed python package where codelet sources // are stored. void setCustomCodeletsPath(const char *cache_path); // Compile a custom codelet (if not already compiled) and store the output // file to the path specified with 'setCustomCodeletsPath' above. This can // safely be called from multiple threads/processes. std::unique_ptr compileCustomCodeletIfNeeded(const char *src_file_name, bool hw_only_codelet); } // namespace popart_compiler } // namespace poptorch #endif // POPART_COMPILER_CODELETS_COMPILATION_HPP ================================================ FILE: popart_compiler/include/popart_compiler/Compiler.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#pragma once

// NOTE(review): the six bare #include directives below have no target in this
// copy of the file, and template argument lists (on std::vector,
// std::unique_ptr, std::function, ...) are missing throughout. Both look to
// have been stripped during text extraction - confirm against the repository.
#include
#include
#include
#include
#include
#include

#include "popart_compiler/CompilerTypes.hpp"
#include "poptorch_logging/LoggingLight.hpp"

// Forward declarations only: the classes below hold these popart types behind
// pointers/references so that this header does not need the popart headers.
namespace popart {
class any;
enum class DataType;
class ConstVoidData;
} // namespace popart

namespace poptorch {
namespace popart_compiler {

// Implementation types, declared here and defined elsewhere.
namespace detail {
struct CompilerImpl;
struct SessionOptionsImpl;
} // namespace detail

void throwTestError(TestErrorType type);

// Examines the supplied exception. If it is a popart or poplar exception,
// rethrow it as an ExceptionInfo subclass (which gives easy access to the
// exception detail)
void rethrowPopartOrPoplarException(const std::exception_ptr &eptr,
                                    const char *filename, uint64_t line);

void setPopartLogLevel(logging::Level level);

// Copies the value and constness of one parameter to another
void copyParam(Optimizer &dest_optim, const Optimizer &source_optim,
               const char *source, const char *dest);

class Compiler;

// Move-only collection of session settings. The state lives behind the
// opaque _impl pointer, and Compiler is declared a friend so that it can
// access that state directly.
class SessionOptions {
public:
  SessionOptions();
  SessionOptions(SessionOptions &&);
  ~SessionOptions();
  // Disable copy: Move only
  SessionOptions(const SessionOptions &) = delete;
  SessionOptions &operator=(const SessionOptions &) = delete;

  void setMemoryProportion(std::uint32_t ipu, float memory);

  void setPatternsLevel(std::uint64_t level);
  void addPattern(const char *pattern, bool enabled);
  void setTensorLocation(const char *tensor, const char *option,
                         std::uint64_t value);

  void setCompilationProgressLogger(const std::function &logger);

  // One setter per value type; each takes the option name and its value.
  void addStringOption(const char *option, const char *value);
  void addUint64Option(const char *option, std::uint64_t value);
  void addBoolOption(const char *option, bool value);
  void addDoubleOption(const char *option, double value);
  // Insert a string option in an option container (set / list / vector)
  void insertStringOption(const char *option, const char *value);
  // Insert a key / value pair in an option map
  void insertStringPairOption(const char *option, const char *key,
                              const char *value);

  bool broadcastBuffers() const;
  bool hasInputReplication() const;

private:
  // NOTE(review): pointee type was stripped; presumably
  // detail::SessionOptionsImpl - confirm.
  std::unique_ptr _impl;
  friend Compiler;
};

// Represents an attribute used in a custom operation: popart uses popart::any
// to store the different values
class PopartAttribute {
public:
  // Templating works with g++ but not clang++
  // NOTE(review): the two `const std::vector &values` overloads below are
  // indistinguishable in this copy because their element types were stripped.
  PopartAttribute(const char *name, const int64_t &value);
  PopartAttribute(const char *name, const std::vector &values);
  PopartAttribute(const char *name, const float &value);
  PopartAttribute(const char *name, const std::vector &values);
  PopartAttribute(const char *name, const std::unique_ptr &str);
  PopartAttribute(const char *name, const std::vector> &strs);

  // Required for opaque pointer
  PopartAttribute(PopartAttribute &&);
  PopartAttribute &operator=(PopartAttribute &&);
  ~PopartAttribute();

  popart::any *getValue();

  // Returns the raw pointer owned by _name; it stays owned by this object.
  const char *name() const { return _name.get(); }

private:
  // Convert a "const char *" to a std::unique_ptr char*
  static std::unique_ptr cStrToUP(const char *name);

  // Use a pointer to circumvent the C++ ABI problems with std::string
  std::unique_ptr _name;

  // Use an opaque pointer to avoid the need for popart headers
  std::unique_ptr _any;
};

// A class to store all the data and info required to create a constant in the
// popart builder for convenience. Internally, it is a simple wrapper to
// popart::ConstVoidData.
class PopartConstant {
public:
  // NOTE(review): template argument lists (on std::vector, std::unique_ptr,
  // std::set, ...) are missing throughout this copy of the header; they look
  // to have been stripped during text extraction - confirm against the
  // repository.
  PopartConstant(const PopartType &popart_type, const void *data,
                 const std::vector &shape);

  ~PopartConstant(); // Required for opaque pointer

  // Dereferences the owned pointer; the reference is only valid while this
  // object is alive.
  const popart::ConstVoidData &getPopartData() const { return *_data; }

private:
  // Use an opaque pointer to avoid the need for popart headers
  std::unique_ptr _data;
};

// A class to store a constant which is simply returned, (possibly in a tuple
// or list) and is not inserted into Popart
class HostSideConstant {
public:
  HostSideConstant(const PopartType &popart_type, void *data, size_t data_size,
                   std::vector shape);

  PopartType popartType() const { return _popart_type; }

  const std::vector &shape() const { return _shape; }

  void copyDataTo(void *ptr) const;

private:
  const PopartType _popart_type;
  std::vector _data;
  std::vector _shape;
};

// Main entry point of this header. The implementation is kept behind the
// opaque _impl pointer.
// NOTE(review): judging by the member names only, the class is used to add
// inputs / operations / outputs, then to compile, load and run the result on
// the device - confirm against the implementation.
class Compiler {
public:
  Compiler(bool is_training, const SessionOptions &options);
  ~Compiler();
  Compiler(Compiler &&compiler);

  TensorId addInputTensor(const char *type, const std::vector &dims,
                          const char *overlap = "no_overlap");

  TensorId createTensorId(const char *name);

  void setCurrentPythonCodeLocation(const char *torch_node,
                                    const char *filename, std::uint64_t line,
                                    std::uint64_t col);

// The macros below are defined only for the duration of the #include of
// SupportedOperations.inc.hpp and are all #undef'd right after it.
// First group: map the type keywords used in the operation table to C++
// parameter types.
#define INT_VEC std::vector
#define FLOAT_VEC std::vector
#define FLOAT float
#define INT std::int64_t
#define BOOL bool
#define CHAR char
#define STRING const char *
#define STRING_VEC std::vector
// NONE expands to nothing: it stands for "no arguments".
#define NONE
// Each *ARG macro starts with a comma, so it appends one more parameter after
// `inputs` in the declarations produced by OP_DECL / OP_DECL_NO_RETURN.
#define ARG(Type, Name) , Type Name
#define POPART_CONST_ARG(Name) , const PopartConstant &Name
#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant &Name
#define POPART_ATTRIB_VEC_ARG(Name)                                            \
  , std::shared_ptr> Name
// Body arguments are irrelevant for declarations: they expand to NONE, i.e.
// to nothing.
#define BODY_ARG(Name) NONE

// Create a function decl with the given call and arguments.
#define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs)       \
  TensorId function(const std::vector &inputs Args);

// Create a function decl with the given call and arguments which returns void.
#define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args,       \
                          BodyArgs)                                            \
  void function(const std::vector &inputs Args);

#include "SupportedOperations.inc.hpp"

#undef OP_DECL
#undef OP_DECL_NO_RETURN
#undef BODY_ARG
#undef POPART_ATTRIB_VEC_ARG
#undef HOST_SIDE_CONST_ARG
#undef POPART_CONST_ARG
#undef ARG
#undef NONE
#undef STRING_VEC
#undef STRING
#undef CHAR
#undef BOOL
#undef INT
#undef FLOAT
#undef FLOAT_VEC
#undef INT_VEC

  TensorId addInitializedInputTensor(const char *name, const char *type,
                                     const std::vector &dims, void *data);

  // Overload taking three extra int parameters (comm_group_type, shards,
  // variable_retrieval_mode).
  TensorId addInitializedInputTensor(const char *name, const char *type,
                                     const std::vector &dims, void *data,
                                     int comm_group_type, int shards,
                                     int variable_retrieval_mode);

  bool tensorIdIsValid(TensorId id) const;
  const char *tensorName(TensorId id) const;

  static const std::vector invalid_size;

  std::vector getSize(TensorId id) const;

  std::unique_ptr getTensorDTypeString(TensorId id) const;

  bool isHostSideConstant(TensorId id) const;

  void addOutputType(OutputTypeShape type);

  // This function marks |output| as being read back from the device by the
  // host. |output_mode| determines how frequently that should happen.
  // clang-format off
  // "ALL":  Will return all popart batches.
  // "SUM": Will return the sum of all popart batches (i.e. device iterations)
  // "EVERYN": Will return every N batch
  // "FINAL": Will return the last batch only
  // clang-format on
  void addOutputTensor(TensorId output,
                       PopartOutputMode output_mode = PopartOutputMode::N,
                       size_t output_return_period = 1,
                       const char *overlap = "no_overlap");

  // setUpInputOp: one overload per host pointer type.
  void setUpInputOp(TensorId id, float *ptr, const std::vector &dims);

  void setUpInputOp(TensorId id, std::int32_t *ptr, const std::vector &dims);

  void setUpInputOp(TensorId id, bool *ptr, const std::vector &dims);

  // NOTE(review): presumably float16 marks the int16 buffer as holding
  // half-precision values - confirm against the implementation.
  void setUpInputOp(TensorId id, std::int16_t *ptr, const std::vector &dims,
                    bool float16 = false);

  // at::ScalarType::Byte
  void setUpInputOp(TensorId id, std::uint8_t *ptr, const std::vector &dims);

  // at::ScalarType::Char
  void setUpInputOp(TensorId id, std::int8_t *ptr, const std::vector &dims);

  // setUpOutputOp: one overload per host pointer type.
  // at::ScalarType::Byte
  void setUpOutputOp(TensorId id, std::uint8_t *ptr, const std::vector &dims);

  // at::ScalarType::Char
  void setUpOutputOp(TensorId id, std::int8_t *ptr, const std::vector &dims);

  void setUpOutputOp(TensorId id, float *ptr, const std::vector &dims);

  void setUpOutputOp(TensorId id, std::int32_t *ptr, const std::vector &dims);

  void setUpOutputOp(TensorId id, bool *ptr, const std::vector &dims);

  void setUpOutputOp(TensorId id, std::int16_t *ptr, const std::vector &dims);

  // Each std::set of tensors represents all the outputs of a node to set
  // the available memory proportion on. This function loops over the outer
  // vector, so the total number of nodes it will set the proportion on
  // will be inputs.size().
  void setAvailableMemoryProportion(const std::vector> &inputs,
                                    float availableMemoryProportion);

  void setMatMulSerialization(TensorId matmul, const char *mode,
                              std::uint64_t factor,
                              std::uint64_t keep_precision);
  void clearActiveIpu();
  void setActiveIpu(std::uint64_t stage_id, std::int64_t phase_id,
                    std::int64_t ipu_id);

  void initSession(const std::vector &opt, const char *export_proto_filename);
  void setRngState(std::uint64_t seed, const std::vector &rng_state);

  std::vector getRngState() const;
  std::uint64_t getRandomSeed() const;

  void saveExecutableToFile(const char *export_filename) const;
  void compileAndPrepareDevice();
  void loadEngineAndConnectStreams();
  void loadExecutableAndPrepareDevice(const char *import_filename);

  // The two metadata helpers are static: they work on a file name and need no
  // Compiler instance.
  static void
  appendPoptorchMetadataToFile(const char *serialized_poptorch_metadata,
                               size_t metadata_length,
                               const char *export_filename);
  static std::vector importPoptorchMetadataFromFile(const char *import_filename);

  TensorId addCPUCallback(const std::vector &inputs,
                          const CallbackMetadata &callback,
                          std::vector input_types,
                          std::vector> input_shapes,
                          std::vector output_types,
                          std::vector> output_shapes);

  void startSubgraph();

  TensorId endForLoop(std::int32_t trip_count, std::int64_t num_outputs,
                      const std::vector &inputs);

  void startIfBlock();

  void startElseBlock();

  TensorId endIfBlock(const TensorId &condition, std::size_t num_outputs);

  void pushNameScope(const char *name);

  void popNameScope();

  TensorId addUntypedInputTensor();
  // Write the weights into IPU memory from the pytorch tensor buffers in the
  // model.
  void copyWeightsToDevice(const std::vector &host_buffers);

  // Write the named buffers into IPU memory from the pytorch tensor buffers
  // in the model.
  void copyNamedBuffersToDevice(const std::vector &host_buffers);

  // Read the weights from IPU memory into the pytorch tensor buffers.
  void copyWeightsToHost(const std::vector &host_buffers);

  // Return the type of the given tensor.
  PopartType getPopartType(TensorId id) const;

  // Execute the compiled popart graph using poplar.
  void run();

  // Update the optimizers currently being run by the graph.
  void updateOptimizers(const std::vector &optimizers);

  std::uint64_t batchPerStep() const;

  // Return the PopART batch dimensions [DeviceIterations * ReplicationFactor *
  // GradientAccumulation]
  std::uint64_t popartBatchDim() const;

  // Take the above and work out how much of it is being returned. ID must be
  // an anchor. The batch dim will be mutated depending on what the anchor is
  // returning.
  std::uint64_t popartBatchDimForAnchor(TensorId id) const;

  // Return a flat representation of the output types
  // For example: ( T0, T2, (T3, T4)) is represented as:
  // [ Tuple3, Tensor, Tensor, Tuple2, Tensor, Tensor ]
  const std::vector &outputTypes() const;

  // We return this as a unique char pointer to avoid leaking memory while
  // protecting the ABI boundary.
  std::unique_ptr getPopartIR() const;

  // We return this as a unique char pointer to avoid leaking memory while
  // protecting the ABI boundary.
  // NOTE(review): this comment duplicates the one on getPopartIR(), but the
  // return type here is a std::set (element type stripped in this copy) -
  // confirm the wording against the repository.
  std::set> getTensorNames() const;

  void optimizerGroup(const std::vector &inputs, int64_t group);

  std::vector optimizerTensorMetadataList() const;

  void fillHostOptimizerStateTensorData(const std::vector &host_buffers);

  void writeDeviceOptimizerStateTensorData(const std::vector &host_buffers);

  std::unique_ptr getExecutionInfo() const;

  // Multi-conv interface: parts are added one at a time, the setters adjust
  // options, and endMultiConv() returns a vector of results.
  void addMultiConvPart(const std::vector &inputs,
                        const std::vector &dilations,
                        const std::vector &kernel_shape,
                        const std::vector &pads,
                        const std::vector &strides);

  void setMultiConvAvailableMemoryProportions(const std::vector &v);

  void setMultiConvPartialsTypes(const std::vector &partials_types);
  void setMultiConvEnableConvDithering(const std::vector &conv_dithering);

  void setMultiConvPlanType(int64_t plan_type);

  void setMultiConvPerConvReservedTiles(int64_t v);

  void setMultiConvCycleBackOff(double c);

  std::vector endMultiConv();

  void setAttribute(const char *attribute, const char *key, const char *value);
  void clearAttribute(const char *attribute, const char *key);

  void detachFromDevice();
  void attachToDevice();
  bool isAttachedToDevice() const;

  Timestamps getTimestamps() const;

  // Returns the number of cycles (on replica 0) run by the IPU for the last
  // model run.
  uint64_t getCycleCount() const;

  size_t getNumInputs() const;
  size_t getNumOutputs() const;

  // Mark named buffer as updatable
  void registerUpdatableNamedBuffer(const TensorId &id);

private:
  void assertTensorIs(PopartType dataType, TensorId id) const;

  // Make sure no overlap is specified for pipelined mode and that the output
  // mode is supported by PopART.
  void verifySettingsForOverlappedIO(PopartOutputMode output_mode);

  // NOTE(review): pointee type was stripped; presumably detail::CompilerImpl -
  // confirm.
  std::unique_ptr _impl;

  // Store the cycle count of the last run, if the relevant option is enabled,
  // otherwise no_cycles
  // NOTE(review): stored as a signed value with -1 as the sentinel, while
  // getCycleCount() returns uint64_t; how no_cycles is reported to callers is
  // not visible in this header.
  int64_t _cycle_count;

  static constexpr int64_t no_cycles = -1;

  static constexpr const char *poptorch_opaque_name = "poptorch";
};

} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/include/popart_compiler/CompilerOperationMacros.inc.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off

// Ops from AiGraphcoreOpset1
OP_DECL(popart, copyvarupdate, copyvarupdate, AiGraphcoreOpset1.copyvarupdate, NONE, BODY_ARG(DEBUG_CONTEXT("Copyvarupdate")))
OP_DECL(popart, batchnormalization, batchnormalization, AiGraphcoreOpset1.batchnormalization, ARG(INT,num_outputs) ARG(FLOAT,epsilon) ARG(FLOAT,momentum) , BODY_ARG(num_outputs) BODY_ARG(epsilon) BODY_ARG(momentum) BODY_ARG(DEBUG_CONTEXT("Batchnormalization")))
OP_DECL(popart, groupnormalization, groupnormalization, AiGraphcoreOpset1.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) , BODY_ARG(num_groups) BODY_ARG(epsilon) BODY_ARG(DEBUG_CONTEXT("Groupnormalization")))
OP_DECL(popart, subsample, subsample, AiGraphcoreOpset1.subsample, ARG(INT_VEC,strides) , BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Subsample")))
OP_DECL(popart, printtensor, printtensor, AiGraphcoreOpset1.printtensor, ARG(INT,print_gradient) ARG(STRING,title) ARG(INT,summariseThreshold) ARG(INT,edgeItems) ARG(INT,maxLineWidth) ARG(INT,digits) ARG(INT,floatFormat) ARG(CHAR,separator) ARG(CHAR,openBracket) ARG(CHAR,closeBracket) , BODY_ARG(print_gradient) BODY_ARG(DEBUG_CONTEXT("Printtensor"))BODY_ARG(title) BODY_ARG(summariseThreshold) BODY_ARG(edgeItems) BODY_ARG(maxLineWidth) BODY_ARG(digits) BODY_ARG(floatFormat) BODY_ARG(separator)
BODY_ARG(openBracket) BODY_ARG(closeBracket) ) OP_DECL(popart, nop, nop, AiGraphcoreOpset1.nop, NONE, BODY_ARG(DEBUG_CONTEXT("Nop"))) OP_DECL(popart, scale, scale, AiGraphcoreOpset1.scale, ARG(FLOAT,scale) , BODY_ARG(scale) BODY_ARG(DEBUG_CONTEXT("Scale"))) OP_DECL(popart, scaledadd, scaledadd, AiGraphcoreOpset1.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) , BODY_ARG(scale0) BODY_ARG(scale1) BODY_ARG(DEBUG_CONTEXT("Scaledadd"))) OP_DECL(popart, lstm, lstm, AiGraphcoreOpset1.lstm, ARG(INT,outputFullSequence) , BODY_ARG(outputFullSequence) BODY_ARG(DEBUG_CONTEXT("Lstm"))) OP_DECL(popart, gelu, gelu, AiGraphcoreOpset1.gelu, NONE, BODY_ARG(DEBUG_CONTEXT("Gelu"))) OP_DECL(popart, geluerf, geluerf, AiGraphcoreOpset1.geluerf, NONE, BODY_ARG(DEBUG_CONTEXT("GeluErf"))) OP_DECL(popart, detach, detach, AiGraphcoreOpset1.detach, NONE, BODY_ARG(DEBUG_CONTEXT("Detach"))) OP_DECL(popart, depthtospace, depthtospace, AiGraphcoreOpset1.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) , BODY_ARG(blocksize) BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Depthtospace"))) OP_DECL(popart, round, round, AiGraphcoreOpset1.round, NONE, BODY_ARG(DEBUG_CONTEXT("Round"))) OP_DECL(popart, dynamicslice, dynamicslice, AiGraphcoreOpset1.dynamicslice, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(noOverlap) BODY_ARG(DEBUG_CONTEXT("Dynamicslice"))) OP_DECL(popart, dynamicupdate, dynamicupdate, AiGraphcoreOpset1.dynamicupdate, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(noOverlap) BODY_ARG(DEBUG_CONTEXT("Dynamicupdate"))) OP_DECL(popart, dynamiczero, dynamiczero, AiGraphcoreOpset1.dynamiczero, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(DEBUG_CONTEXT("Dynamiczero"))) OP_DECL(popart, dynamicadd, dynamicadd, AiGraphcoreOpset1.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) , BODY_ARG(axes) BODY_ARG(sizes) BODY_ARG(DEBUG_CONTEXT("Dynamicadd"))) OP_DECL(popart, 
sequenceslice, sequenceslice, AiGraphcoreOpset1.sequenceslice, ARG(INT,zeroUnused) , BODY_ARG(zeroUnused) BODY_ARG(DEBUG_CONTEXT("Sequenceslice"))) OP_DECL(popart, l1loss, l1loss, AiGraphcoreOpset1.l1loss, ARG(FLOAT,lambda) ARG(INT,reduction) , BODY_ARG(lambda) BODY_ARG(static_cast(reduction)) BODY_ARG(DEBUG_CONTEXT("L1loss"))) OP_DECL(popart, nllloss, nllloss, AiGraphcoreOpset1.nllloss, ARG(INT,reduction) ARG(INT,ignoreIndex) ARG(INT,inputIsLogProbability) , BODY_ARG(static_cast(reduction)) BODY_ARG(ignoreIndex) BODY_ARG(inputIsLogProbability) BODY_ARG(DEBUG_CONTEXT("Nllloss"))) OP_DECL(popart, identityloss, identityloss, AiGraphcoreOpset1.identityloss, ARG(INT,reduction) , BODY_ARG(static_cast(reduction)) BODY_ARG(DEBUG_CONTEXT("Identityloss"))) OP_DECL(popart, _ctcloss, _ctcloss, AiGraphcoreOpset1._ctcloss, ARG(INT,reduction) ARG(INT,blank) ARG(STRING,outDataType) ARG(INT,zeroInfinity) , BODY_ARG(static_cast(reduction)) BODY_ARG(blank) BODY_ARG(outDataType) BODY_ARG(zeroInfinity) BODY_ARG(DEBUG_CONTEXT("_ctcloss"))) OP_DECL(popart, ctcbeamsearchdecoder, ctcbeamsearchdecoder, AiGraphcoreOpset1.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) , BODY_ARG(blank) BODY_ARG(beamWidth) BODY_ARG(topPaths) BODY_ARG(DEBUG_CONTEXT("Ctcbeamsearchdecoder"))) OP_DECL(popart, shapeddropout, shapeddropout, AiGraphcoreOpset1.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) , BODY_ARG(shape) BODY_ARG(ratio) BODY_ARG(DEBUG_CONTEXT("Shapeddropout"))) OP_DECL(popart, atan2, atan2, AiGraphcoreOpset1.atan2, NONE, BODY_ARG(DEBUG_CONTEXT("Atan2"))) OP_DECL(popart, expm1, expm1, AiGraphcoreOpset1.expm1, NONE, BODY_ARG(DEBUG_CONTEXT("Expm1"))) OP_DECL(popart, log1p, log1p, AiGraphcoreOpset1.log1p, NONE, BODY_ARG(DEBUG_CONTEXT("Log1p"))) OP_DECL(popart, fmod, fmod, AiGraphcoreOpset1.fmod, NONE, BODY_ARG(DEBUG_CONTEXT("Fmod"))) OP_DECL(popart, remainder, remainder, AiGraphcoreOpset1.remainder, NONE, BODY_ARG(DEBUG_CONTEXT("Remainder"))) OP_DECL(popart, reverse, 
reverse, AiGraphcoreOpset1.reverse, ARG(INT_VEC,dimensions) , BODY_ARG(dimensions) BODY_ARG(DEBUG_CONTEXT("Reverse"))) OP_DECL(popart, slice, slice, AiGraphcoreOpset1.slice, ARG(INT_VEC,ends) ARG(INT_VEC,starts) ARG(INT_VEC,axes) , BODY_ARG(ends) BODY_ARG(starts) BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Slice"))) OP_DECL(popart, bitwisenot, bitwisenot, AiGraphcoreOpset1.bitwisenot, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisenot"))) OP_DECL(popart, bitwiseand, bitwiseand, AiGraphcoreOpset1.bitwiseand, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwiseand"))) OP_DECL(popart, bitwiseor, bitwiseor, AiGraphcoreOpset1.bitwiseor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwiseor"))) OP_DECL(popart, bitwisexor, bitwisexor, AiGraphcoreOpset1.bitwisexor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisexor"))) OP_DECL(popart, bitwisexnor, bitwisexnor, AiGraphcoreOpset1.bitwisexnor, NONE, BODY_ARG(DEBUG_CONTEXT("Bitwisexnor"))) OP_DECL(popart, reducemedian, reducemedian, AiGraphcoreOpset1.reducemedian, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemedian"))) OP_DECL(popart, scatterreduce, scatterreduce, AiGraphcoreOpset1.scatterreduce, ARG(INT,axis_size) ARG(INT,axis) ARG(INT,reduction) ARG(INT, enable_index_broadcast), BODY_ARG(axis_size) BODY_ARG(axis) BODY_ARG(static_cast(reduction)) BODY_ARG(enable_index_broadcast) BODY_ARG(DEBUG_CONTEXT("Scatterreduce"))) OP_DECL(popart, groupedscatterreduce, groupedscatterreduce, AiGraphcoreOpset1.groupedscatterreduce, ARG(INT,axis_size) ARG(INT,axis) ARG(INT,reduction) ARG(INT,group_size) ARG(INT, enable_index_broadcast), BODY_ARG(axis_size) BODY_ARG(axis) BODY_ARG(static_cast(reduction)) BODY_ARG(group_size) BODY_ARG(enable_index_broadcast) BODY_ARG(DEBUG_CONTEXT("Scatterreduce"))) OP_DECL(popart, groupedgather, groupedgather, AiGraphcoreOpset1.groupedgather, ARG(INT,axis) ARG(INT,group_size) , BODY_ARG(axis) BODY_ARG(group_size) BODY_ARG(DEBUG_CONTEXT("GroupedGather"))) OP_DECL(popart, swish, swish, 
AiGraphcoreOpset1.swish, NONE, BODY_ARG(DEBUG_CONTEXT("Swish"))) OP_DECL(popart, bucketize, bucketize, AiGraphcoreOpset1.bucketize, ARG(INT,right) , BODY_ARG(right) BODY_ARG(DEBUG_CONTEXT("Bucketize"))) OP_DECL(popart, sort, sort, AiGraphcoreOpset1.sort, ARG(INT,axis) ARG(INT,descending) ARG(INT,stable) , BODY_ARG(axis) BODY_ARG(descending) BODY_ARG(stable) BODY_ARG(DEBUG_CONTEXT("Sort"))) OP_DECL(popart, nearbyint, nearbyint, AiGraphcoreOpset1.nearbyint, NONE, BODY_ARG(DEBUG_CONTEXT("NearbyInt"))) OP_DECL(popart, splinebasis, splinebasis, AiGraphcoreOpset1.splinebasis, ARG(INT,degree) , BODY_ARG(degree) BODY_ARG(DEBUG_CONTEXT("SplineBasis"))) OP_DECL(popart, splineweighting, splineweighting, AiGraphcoreOpset1.splineweighting, NONE, BODY_ARG(DEBUG_CONTEXT("SplineWeighting"))) // Ops from AiOnnxOpset11 OP_DECL(popart, topk, topk, AiOnnxOpset11.topk, ARG(INT,axis) ARG(INT,largest) ARG(INT,sorted), BODY_ARG(axis) BODY_ARG(largest) BODY_ARG(sorted) BODY_ARG(DEBUG_CONTEXT("Topk"))) // Ops from AiOnnxOpset11 OP_DECL(popart, averagepool, averagepool, AiOnnxOpset11.averagepool, ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT,count_include_pad) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(ceil_mode) BODY_ARG(count_include_pad) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Averagepool"))) OP_DECL(popart, convinteger, convinteger, AiOnnxOpset11.convinteger, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Convinteger"))) OP_DECL(popart, dequantizelinear, dequantizelinear, AiOnnxOpset11.dequantizelinear, NONE, BODY_ARG(DEBUG_CONTEXT("Dequantizelinear"))) OP_DECL(popart, dropout, dropout, AiOnnxOpset11.dropout, ARG(INT,num_outputs) ARG(FLOAT,ratio) , BODY_ARG(num_outputs) BODY_ARG(ratio) BODY_ARG(DEBUG_CONTEXT("Dropout"))) OP_DECL(popart, isinf, isinf, 
AiOnnxOpset11.isinf, ARG(INT,detect_negative) ARG(INT,detect_positive) , BODY_ARG(detect_negative) BODY_ARG(detect_positive) BODY_ARG(DEBUG_CONTEXT("Isinf"))) OP_DECL(popart, matmulinteger, matmulinteger, AiOnnxOpset11.matmulinteger, NONE, BODY_ARG(DEBUG_CONTEXT("Matmulinteger"))) OP_DECL(popart, maxpool, maxpool, AiOnnxOpset11.maxpool, ARG(INT,num_outputs) ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT_VEC,dilations) ARG(INT_VEC,pads) ARG(INT,storage_order) ARG(INT_VEC,strides) , BODY_ARG(num_outputs) BODY_ARG(kernel_shape) BODY_ARG(ceil_mode) BODY_ARG(dilations) BODY_ARG(pads) BODY_ARG(storage_order) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Maxpool"))) OP_DECL(popart, mod, mod, AiOnnxOpset11.mod, ARG(INT,fmod) , BODY_ARG(fmod) BODY_ARG(DEBUG_CONTEXT("Mod"))) OP_DECL(popart, nonmaxsuppression, nonmaxsuppression, AiOnnxOpset11.nonmaxsuppression, ARG(INT,center_point_box) , BODY_ARG(center_point_box) BODY_ARG(DEBUG_CONTEXT("Nonmaxsuppression"))) OP_DECL(popart, qlinearconv, qlinearconv, AiOnnxOpset11.qlinearconv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Qlinearconv"))) OP_DECL(popart, qlinearmatmul, qlinearmatmul, AiOnnxOpset11.qlinearmatmul, NONE, BODY_ARG(DEBUG_CONTEXT("Qlinearmatmul"))) OP_DECL(popart, quantizelinear, quantizelinear, AiOnnxOpset11.quantizelinear, NONE, BODY_ARG(DEBUG_CONTEXT("Quantizelinear"))) OP_DECL(popart, resize, resize, AiOnnxOpset11.resize, ARG(STRING,coordinate_transformation_mode) ARG(FLOAT,cubic_coeff_a) ARG(INT,exclude_outside) ARG(FLOAT,extrapolation_value) ARG(STRING,mode) ARG(STRING,nearest_mode), BODY_ARG(coordinate_transformation_mode) BODY_ARG(cubic_coeff_a) BODY_ARG(exclude_outside) BODY_ARG(extrapolation_value) BODY_ARG(mode) BODY_ARG(nearest_mode) BODY_ARG(DEBUG_CONTEXT("Resize"))) OP_DECL(popart, reversesequence, reversesequence, 
AiOnnxOpset11.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) , BODY_ARG(batch_axis) BODY_ARG(time_axis) BODY_ARG(DEBUG_CONTEXT("Reversesequence"))) OP_DECL(popart, roialign, roialign, AiOnnxOpset11.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) , BODY_ARG(mode) BODY_ARG(output_height) BODY_ARG(output_width) BODY_ARG(sampling_ratio) BODY_ARG(spatial_scale) BODY_ARG(DEBUG_CONTEXT("Roialign"))) OP_DECL(popart, thresholdedrelu, thresholdedrelu, AiOnnxOpset11.thresholdedrelu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Thresholdedrelu"))) OP_DECL(popart, upsample, upsample, AiOnnxOpset11.upsample, ARG(STRING,mode) , BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Upsample"))) // Ops from AiOnnxOpset9 OP_DECL(popart, acosh, acosh, AiOnnxOpset11.acosh, NONE, BODY_ARG(DEBUG_CONTEXT("Acosh"))) OP_DECL(popart, asinh, asinh, AiOnnxOpset11.asinh, NONE, BODY_ARG(DEBUG_CONTEXT("Asinh"))) OP_DECL(popart, atanh, atanh, AiOnnxOpset11.atanh, NONE, BODY_ARG(DEBUG_CONTEXT("Atanh"))) OP_DECL(popart, cast, cast, AiOnnxOpset11.cast, ARG(STRING,to) , BODY_ARG(to) BODY_ARG(DEBUG_CONTEXT("Cast"))) OP_DECL(popart, compress, compress, AiOnnxOpset11.compress, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Compress"))) OP_DECL(popart, cosh, cosh, AiOnnxOpset11.cosh, NONE, BODY_ARG(DEBUG_CONTEXT("Cosh"))) OP_DECL(popart, erf, erf, AiOnnxOpset11.erf, NONE, BODY_ARG(DEBUG_CONTEXT("Erf"))) OP_DECL(popart, eyelike, eyelike, AiOnnxOpset11.eyelike, ARG(INT,dtype) ARG(INT,k) , BODY_ARG(dtype) BODY_ARG(k) BODY_ARG(DEBUG_CONTEXT("Eyelike"))) OP_DECL(popart, flatten, flatten, AiOnnxOpset11.flatten, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Flatten"))) OP_DECL(popart, gemm, gemm, AiOnnxOpset11.gemm, ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(INT,transA) ARG(INT,transB) , BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(transA) BODY_ARG(transB) BODY_ARG(DEBUG_CONTEXT("Gemm"))) OP_DECL(popart, greater, greater, 
AiOnnxOpset11.greater, NONE, BODY_ARG(DEBUG_CONTEXT("Greater"))) OP_DECL(popart, isnan, isnan, AiOnnxOpset11.isnan, NONE, BODY_ARG(DEBUG_CONTEXT("Isnan"))) OP_DECL(popart, less, less, AiOnnxOpset11.less, NONE, BODY_ARG(DEBUG_CONTEXT("Less"))) OP_DECL(popart, matmul, matmul, AiOnnxOpset11.matmul, NONE, BODY_ARG(DEBUG_CONTEXT("Matmul"))) OP_DECL(popart, maxunpool, maxunpool, AiOnnxOpset11.maxunpool, ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Maxunpool"))) OP_DECL(popart, meanvariancenormalization, meanvariancenormalization, AiOnnxOpset11.meanvariancenormalization, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Meanvariancenormalization"))) OP_DECL(popart, nonzero, nonzero, AiOnnxOpset11.nonzero, NONE, BODY_ARG(DEBUG_CONTEXT("Nonzero"))) OP_DECL(popart, onehot, onehot, AiOnnxOpset11.onehot, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Onehot"))) OP_DECL(popart, scatter, scatter, AiOnnxOpset11.scatter, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Scatter"))) OP_DECL(popart, scatterelements, scatterelements, AiOnnxOpset11.scatterelements, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("ScatterElements"))) OP_DECL(popart, shrink, shrink, AiOnnxOpset11.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) , BODY_ARG(bias) BODY_ARG(lambd) BODY_ARG(DEBUG_CONTEXT("Shrink"))) OP_DECL(popart, sign, sign, AiOnnxOpset11.sign, NONE, BODY_ARG(DEBUG_CONTEXT("Sign"))) OP_DECL(popart, sinh, sinh, AiOnnxOpset11.sinh, NONE, BODY_ARG(DEBUG_CONTEXT("Sinh"))) OP_DECL(popart, tfidfvectorizer, tfidfvectorizer, AiOnnxOpset11.tfidfvectorizer, ARG(INT,max_gram_length) ARG(INT,max_skip_count) ARG(INT,min_gram_length) ARG(STRING,mode) ARG(INT_VEC,ngram_counts) ARG(INT_VEC,ngram_indexes) ARG(INT_VEC,pool_int64s) ARG(STRING_VEC,pool_strings) ARG(FLOAT_VEC,weights) , BODY_ARG(max_gram_length) BODY_ARG(max_skip_count) BODY_ARG(min_gram_length) BODY_ARG(mode) 
BODY_ARG(ngram_counts) BODY_ARG(ngram_indexes) BODY_ARG(pool_int64s) BODY_ARG(pool_strings) BODY_ARG(weights) BODY_ARG(DEBUG_CONTEXT("Tfidfvectorizer"))) OP_DECL(popart, where, where, AiOnnxOpset11.where, NONE, BODY_ARG(DEBUG_CONTEXT("Where"))) // Ops from AiOnnxOpset8 OP_DECL(popart, expand, expand, AiOnnxOpset11.expand, NONE, BODY_ARG(DEBUG_CONTEXT("Expand"))) OP_DECL(popart, max, max, AiOnnxOpset11.max, NONE, BODY_ARG(DEBUG_CONTEXT("Max"))) OP_DECL(popart, mean, mean, AiOnnxOpset11.mean, NONE, BODY_ARG(DEBUG_CONTEXT("Mean"))) OP_DECL(popart, min, min, AiOnnxOpset11.min, NONE, BODY_ARG(DEBUG_CONTEXT("Min"))) OP_DECL(popart, sum, sum, AiOnnxOpset11.sum, NONE, BODY_ARG(DEBUG_CONTEXT("Sum"))) // Ops from AiOnnxOpset7 OP_DECL(popart, acos, acos, AiOnnxOpset11.acos, NONE, BODY_ARG(DEBUG_CONTEXT("Acos"))) OP_DECL(popart, add, add, AiOnnxOpset11.add, NONE, BODY_ARG(DEBUG_CONTEXT("Add"))) OP_DECL(popart, logical_and, logical_and, AiOnnxOpset11.logical_and, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_and"))) OP_DECL(popart, asin, asin, AiOnnxOpset11.asin, NONE, BODY_ARG(DEBUG_CONTEXT("Asin"))) OP_DECL(popart, atan, atan, AiOnnxOpset11.atan, NONE, BODY_ARG(DEBUG_CONTEXT("Atan"))) OP_DECL(popart, cos, cos, AiOnnxOpset11.cos, NONE, BODY_ARG(DEBUG_CONTEXT("Cos"))) OP_DECL(popart, div, div, AiOnnxOpset11.div, NONE, BODY_ARG(DEBUG_CONTEXT("Div"))) OP_DECL(popart, equal, equal, AiOnnxOpset11.equal, NONE, BODY_ARG(DEBUG_CONTEXT("Equal"))) OP_DECL(popart, mul, mul, AiOnnxOpset11.mul, NONE, BODY_ARG(DEBUG_CONTEXT("Mul"))) OP_DECL(popart, multinomial, multinomial, AiOnnxOpset11.multinomial, ARG(INT,dtype) ARG(INT,sample_size) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(sample_size) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Multinomial"))) OP_DECL(popart, logical_or, logical_or, AiOnnxOpset11.logical_or, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_or"))) OP_DECL(popart, pow, pow, AiOnnxOpset11.pow, NONE, BODY_ARG(DEBUG_CONTEXT("Pow"))) OP_DECL(popart, sin, sin, AiOnnxOpset11.sin, NONE, 
BODY_ARG(DEBUG_CONTEXT("Sin"))) OP_DECL(popart, sub, sub, AiOnnxOpset11.sub, NONE, BODY_ARG(DEBUG_CONTEXT("Sub"))) OP_DECL(popart, tan, tan, AiOnnxOpset11.tan, NONE, BODY_ARG(DEBUG_CONTEXT("Tan"))) OP_DECL(popart, logical_xor, logical_xor, AiOnnxOpset11.logical_xor, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_xor"))) // Ops from AiOnnxOpset6 OP_DECL(popart, abs, abs, AiOnnxOpset11.abs, NONE, BODY_ARG(DEBUG_CONTEXT("Abs"))) OP_DECL(popart, argmax, argmax, AiOnnxOpset11.argmax, ARG(INT,axis) ARG(INT,keepdims) , BODY_ARG(axis) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Argmax"))) OP_DECL(popart, argmin, argmin, AiOnnxOpset11.argmin, ARG(INT,axis) ARG(INT,keepdims) , BODY_ARG(axis) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Argmin"))) OP_DECL(popart, ceil, ceil, AiOnnxOpset11.ceil, NONE, BODY_ARG(DEBUG_CONTEXT("Ceil"))) OP_DECL(popart, clip, clip, AiOnnxOpset11.clip, NONE, BODY_ARG(DEBUG_CONTEXT("Clip"))) OP_DECL(popart, concat, concat, AiOnnxOpset11.concat, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Concat"))) OP_DECL(popart, conv, conv, AiOnnxOpset11.conv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Conv"))) OP_DECL(popart, convtranspose, convtranspose, AiOnnxOpset11.convtranspose, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,output_padding) ARG(INT_VEC,output_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) , BODY_ARG(dilations) BODY_ARG(group) BODY_ARG(kernel_shape) BODY_ARG(output_padding) BODY_ARG(output_shape) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Convtranspose"))) OP_DECL(popart, elu, elu, AiOnnxOpset11.elu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Elu"))) OP_DECL(popart, exp, exp, AiOnnxOpset11.exp, NONE, BODY_ARG(DEBUG_CONTEXT("Exp"))) OP_DECL(popart, floor, floor, AiOnnxOpset11.floor, NONE, 
BODY_ARG(DEBUG_CONTEXT("Floor"))) OP_DECL(popart, gather, gather, AiOnnxOpset11.gather, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Gather"))) OP_DECL(popart, globalaveragepool, globalaveragepool, AiOnnxOpset11.globalaveragepool, NONE, BODY_ARG(DEBUG_CONTEXT("Globalaveragepool"))) OP_DECL(popart, globallppool, globallppool, AiOnnxOpset11.globallppool, ARG(INT,p) , BODY_ARG(p) BODY_ARG(DEBUG_CONTEXT("Globallppool"))) OP_DECL(popart, globalmaxpool, globalmaxpool, AiOnnxOpset11.globalmaxpool, NONE, BODY_ARG(DEBUG_CONTEXT("Globalmaxpool"))) OP_DECL(popart, hardsigmoid, hardsigmoid, AiOnnxOpset11.hardsigmoid, ARG(FLOAT,alpha) ARG(FLOAT,beta) , BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(DEBUG_CONTEXT("Hardsigmoid"))) OP_DECL(popart, hardmax, hardmax, AiOnnxOpset11.hardmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Hardmax"))) OP_DECL(popart, identity, identity, AiOnnxOpset11.identity, NONE, BODY_ARG(DEBUG_CONTEXT("Identity"))) OP_DECL(popart, instancenormalization, instancenormalization, AiOnnxOpset11.instancenormalization, ARG(FLOAT,epsilon) , BODY_ARG(epsilon) BODY_ARG(DEBUG_CONTEXT("Instancenormalization"))) OP_DECL(popart, lrn, lrn, AiOnnxOpset11.lrn, ARG(INT,size) ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(FLOAT,bias) , BODY_ARG(size) BODY_ARG(alpha) BODY_ARG(beta) BODY_ARG(bias) BODY_ARG(DEBUG_CONTEXT("Lrn"))) OP_DECL(popart, leakyrelu, leakyrelu, AiOnnxOpset11.leakyrelu, ARG(FLOAT,alpha) , BODY_ARG(alpha) BODY_ARG(DEBUG_CONTEXT("Leakyrelu"))) OP_DECL(popart, log, log, AiOnnxOpset11.log, NONE, BODY_ARG(DEBUG_CONTEXT("Log"))) OP_DECL(popart, logsoftmax, logsoftmax, AiOnnxOpset11.logsoftmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Logsoftmax"))) OP_DECL(popart, lpnormalization, lpnormalization, AiOnnxOpset11.lpnormalization, ARG(INT,axis) ARG(INT,p) , BODY_ARG(axis) BODY_ARG(p) BODY_ARG(DEBUG_CONTEXT("Lpnormalization"))) OP_DECL(popart, lppool, lppool, AiOnnxOpset11.lppool, ARG(INT_VEC,kernel_shape) ARG(INT,p) ARG(INT_VEC,pads) 
ARG(INT_VEC,strides) , BODY_ARG(kernel_shape) BODY_ARG(p) BODY_ARG(pads) BODY_ARG(strides) BODY_ARG(DEBUG_CONTEXT("Lppool"))) OP_DECL(popart, maxroipool, maxroipool, AiOnnxOpset11.maxroipool, ARG(INT_VEC,pooled_shape) ARG(FLOAT,spatial_scale) , BODY_ARG(pooled_shape) BODY_ARG(spatial_scale) BODY_ARG(DEBUG_CONTEXT("Maxroipool"))) OP_DECL(popart, neg, neg, AiOnnxOpset11.neg, NONE, BODY_ARG(DEBUG_CONTEXT("Neg"))) OP_DECL(popart, logical_not, logical_not, AiOnnxOpset11.logical_not, NONE, BODY_ARG(DEBUG_CONTEXT("Logical_not"))) OP_DECL(popart, pad, pad, AiOnnxOpset11.pad, ARG(STRING,mode), BODY_ARG(mode) BODY_ARG(DEBUG_CONTEXT("Pad"))) OP_DECL(popart, randomnormallike, randomnormallike, AiOnnxOpset11.randomnormallike, ARG(INT,dtype) ARG(FLOAT,mean) ARG(FLOAT,scale) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(mean) BODY_ARG(scale) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Randomnormallike"))) OP_DECL(popart, randomuniformlike, randomuniformlike, AiOnnxOpset11.randomuniformlike, ARG(INT,dtype) ARG(FLOAT,high) ARG(FLOAT,low) ARG(FLOAT,seed) , BODY_ARG(dtype) BODY_ARG(high) BODY_ARG(low) BODY_ARG(seed) BODY_ARG(DEBUG_CONTEXT("Randomuniformlike"))) OP_DECL(popart, reciprocal, reciprocal, AiOnnxOpset11.reciprocal, NONE, BODY_ARG(DEBUG_CONTEXT("Reciprocal"))) OP_DECL(popart, reducel1, reducel1, AiOnnxOpset11.reducel1, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducel1"))) OP_DECL(popart, reducel2, reducel2, AiOnnxOpset11.reducel2, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducel2"))) OP_DECL(popart, reducelogsum, reducelogsum, AiOnnxOpset11.reducelogsum, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducelogsum"))) OP_DECL(popart, reducelogsumexp, reducelogsumexp, AiOnnxOpset11.reducelogsumexp, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducelogsumexp"))) OP_DECL(popart, 
reducemax, reducemax, AiOnnxOpset11.reducemax, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemax"))) OP_DECL(popart, reducemean, reducemean, AiOnnxOpset11.reducemean, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemean"))) OP_DECL(popart, reducemin, reducemin, AiOnnxOpset11.reducemin, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducemin"))) OP_DECL(popart, reduceprod, reduceprod, AiOnnxOpset11.reduceprod, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reduceprod"))) OP_DECL(popart, reducesum, reducesum, AiOnnxOpset11.reducesum, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducesum"))) OP_DECL(popart, reducesumsquare, reducesumsquare, AiOnnxOpset11.reducesumsquare, ARG(INT_VEC,axes) ARG(INT,keepdims) , BODY_ARG(axes) BODY_ARG(keepdims) BODY_ARG(DEBUG_CONTEXT("Reducesumsquare"))) OP_DECL(popart, relu, relu, AiOnnxOpset11.relu, NONE, BODY_ARG(DEBUG_CONTEXT("Relu"))) OP_DECL(popart, selu, selu, AiOnnxOpset11.selu, ARG(FLOAT,alpha) ARG(FLOAT,gamma) , BODY_ARG(alpha) BODY_ARG(gamma) BODY_ARG(DEBUG_CONTEXT("Selu"))) OP_DECL(popart, shape, shape, AiOnnxOpset11.shape, NONE, BODY_ARG(DEBUG_CONTEXT("Shape"))) OP_DECL(popart, sigmoid, sigmoid, AiOnnxOpset11.sigmoid, NONE, BODY_ARG(DEBUG_CONTEXT("Sigmoid"))) OP_DECL(popart, size, size, AiOnnxOpset11.size, NONE, BODY_ARG(DEBUG_CONTEXT("Size"))) OP_DECL(popart, softmax, softmax, AiOnnxOpset11.softmax, ARG(INT,axis) , BODY_ARG(axis) BODY_ARG(DEBUG_CONTEXT("Softmax"))) OP_DECL(popart, softplus, softplus, AiOnnxOpset11.softplus, NONE, BODY_ARG(DEBUG_CONTEXT("Softplus"))) OP_DECL(popart, softsign, softsign, AiOnnxOpset11.softsign, NONE, BODY_ARG(DEBUG_CONTEXT("Softsign"))) OP_DECL(popart, spacetodepth, spacetodepth, AiOnnxOpset11.spacetodepth, ARG(INT,blocksize) , 
BODY_ARG(blocksize) BODY_ARG(DEBUG_CONTEXT("Spacetodepth"))) OP_DECL(popart, split, split, AiOnnxOpset11.split, ARG(INT,num_outputs) ARG(INT,axis) ARG(INT_VEC,split) , BODY_ARG(num_outputs) BODY_ARG(axis) BODY_ARG(split) BODY_ARG(DEBUG_CONTEXT("Split"))) OP_DECL(popart, sqrt, sqrt, AiOnnxOpset11.sqrt, NONE, BODY_ARG(DEBUG_CONTEXT("Sqrt"))) OP_DECL(popart, squeeze, squeeze, AiOnnxOpset11.squeeze, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Squeeze"))) OP_DECL(popart, tanh, tanh, AiOnnxOpset11.tanh, NONE, BODY_ARG(DEBUG_CONTEXT("Tanh"))) OP_DECL(popart, tile, tile, AiOnnxOpset11.tile, NONE, BODY_ARG(DEBUG_CONTEXT("Tile"))) OP_DECL(popart, transpose, transpose, AiOnnxOpset11.transpose, ARG(INT_VEC,perm) , BODY_ARG(perm) BODY_ARG(DEBUG_CONTEXT("Transpose"))) OP_DECL(popart, unsqueeze, unsqueeze, AiOnnxOpset11.unsqueeze, ARG(INT_VEC,axes) , BODY_ARG(axes) BODY_ARG(DEBUG_CONTEXT("Unsqueeze"))) ================================================ FILE: popart_compiler/include/popart_compiler/ManuallyAddedOperations.inc.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
// Manually added operation declarations.
//
// Every entry has the form
//   OP_DECL(namespace, funcName, function, onnx implementation, arguments,
//           body arguments)
// and is expanded by whichever file includes this list after defining
// OP_DECL, OP_DECL_NO_RETURN, ARG, BODY_ARG, DEBUG_CONTEXT and the type macros.

OP_DECL(popart, reshape_static_shape, reshape, _impl->reshape,
        ARG(INT_VEC, shape),
        BODY_ARG(shape))

// The debug context is passed as the second body argument, ahead of the
// formatting options.
OP_DECL(poptorch, ipu_print_tensor, ipu_print_tensor, AiGraphcoreOpset1.printtensor,
        ARG(INT,print_gradient) ARG(STRING,title) ARG(INT,summariseThreshold) ARG(INT,edgeItems) ARG(INT,maxLineWidth) ARG(INT,digits) ARG(INT,floatFormat) ARG(CHAR,separator) ARG(CHAR,openBracket) ARG(CHAR,closeBracket) ,
        BODY_ARG(print_gradient) BODY_ARG(DEBUG_CONTEXT("Printtensor"))BODY_ARG(title) BODY_ARG(summariseThreshold) BODY_ARG(edgeItems) BODY_ARG(maxLineWidth) BODY_ARG(digits) BODY_ARG(floatFormat) BODY_ARG(separator) BODY_ARG(openBracket) BODY_ARG(closeBracket))

OP_DECL(poptorch, tensor_constant, tensor_constant, _impl->tensorConstant,
        POPART_CONST_ARG(popartConstant),
        BODY_ARG(popartConstant))

OP_DECL(poptorch, host_side_tensor_constant, host_side_tensor_constant, _impl->hostSideTensorConstant,
        HOST_SIDE_CONST_ARG(hostSideTensorConstant),
        BODY_ARG(hostSideTensorConstant))

// The three padding flavours all forward to AiOnnxOpset11.pad with a fixed
// mode string; each one carries a debug context named after itself.
OP_DECL(poptorch, constant_pad, constant_pad, AiOnnxOpset11.pad,
        NONE,
        BODY_ARG("constant") BODY_ARG(DEBUG_CONTEXT("Constantpad")))

// Previously declared without any debug context.
OP_DECL(poptorch, reflection_pad, reflection_pad, AiOnnxOpset11.pad,
        NONE,
        BODY_ARG("reflect") BODY_ARG(DEBUG_CONTEXT("Reflectionpad")))

// Previously mislabelled "Reflectionpad".
OP_DECL(poptorch, edge_pad, edge_pad, AiOnnxOpset11.pad,
        NONE,
        BODY_ARG("edge") BODY_ARG(DEBUG_CONTEXT("Edgepad")))

OP_DECL(poptorch, add_not_in_place, add_not_in_place, _impl->addNotInPlace,
        NONE,
        NONE)

OP_DECL(poptorch, custom_operation, custom_operation, _impl->customOperation,
        ARG(STRING, name) ARG(STRING, domain) ARG(INT, version) ARG(INT, num_outputs) POPART_ATTRIB_VEC_ARG(attributes),
        BODY_ARG(name) BODY_ARG(domain) BODY_ARG(version) BODY_ARG(num_outputs) BODY_ARG(attributes))

// The only declaration here that does not return a tensor id.
OP_DECL_NO_RETURN(poptorch, addOutputTensor, addOutputTensor, _impl->addOutputTensor,
        NONE,
        NONE)

OP_DECL(poptorch, random_uniform, random_uniform, _impl->randomUniform,
        ARG(INT_VEC, shape) ARG(FLOAT, high) ARG(FLOAT, low) ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(high) BODY_ARG(low) BODY_ARG(dtype))

OP_DECL(poptorch, random_normal, random_normal, _impl->randomNormal,
        ARG(INT_VEC, shape) ARG(FLOAT, mean) ARG(FLOAT, scale) ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(mean) BODY_ARG(scale) BODY_ARG(dtype))

OP_DECL(poptorch, ones, ones, _impl->ones,
        ARG(INT_VEC, shape) ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(dtype))

OP_DECL(poptorch, zeros, zeros, _impl->zeros,
        ARG(INT_VEC, shape) ARG(STRING, dtype),
        BODY_ARG(shape) BODY_ARG(dtype))

OP_DECL(poptorch, recomputation_checkpoint, recomputation_checkpoint, _impl->recomputationCheckpoint,
        NONE,
        NONE)

OP_DECL(poptorch, unfold, unfold, _impl->unfold,
        ARG(INT, dimension) ARG(INT, size) ARG(INT, step),
        BODY_ARG(dimension) BODY_ARG(size) BODY_ARG(step))

OP_DECL(poptorch, prelu, prelu, _impl->prelu,
        NONE,
        NONE)

// Operations which need extra types
// NOTE(review): the template arguments of the four helper macros below look
// like they were lost when this file was extracted - restore them from the
// upstream header before building.
#define EMPTY_FLOAT_VEC std::vector()
#define EMPTY_STRING_VEC std::vector()
#define OPTIONAL_FLOAT nonstd::optional()
#define OPTIONAL_INT nonstd::optional()

OP_DECL(poptorch, gru, gru, AiOnnxOpset11.gru,
        ARG(INT, hidden_size),
        BODY_ARG(2) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(EMPTY_STRING_VEC) BODY_ARG(OPTIONAL_FLOAT) BODY_ARG("forward") BODY_ARG(hidden_size) BODY_ARG(1) BODY_ARG(DEBUG_CONTEXT("Gru")))

OP_DECL(poptorch, rnn, rnn, AiOnnxOpset11.rnn,
        ARG(STRING_VEC, activations),
        BODY_ARG(2) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(EMPTY_FLOAT_VEC) BODY_ARG(activations) BODY_ARG(OPTIONAL_FLOAT) BODY_ARG("forward") BODY_ARG(OPTIONAL_INT) BODY_ARG(DEBUG_CONTEXT("Rnn")))

// The helper macros are local to this list.
#undef EMPTY_STRING_VEC
#undef OPTIONAL_INT
#undef OPTIONAL_FLOAT
#undef EMPTY_FLOAT_VEC

================================================
FILE: popart_compiler/include/popart_compiler/SupportedOperations.inc.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
/*
  OP_DECLS are in the following form:
  OP_DECL(namespace, funcName, function, onnx implementation, arguments,
          body argument)
  - namespace is the op's namespace
  - funcName is the op name
  - function is the actual op part of the namespace:op pair and will be used
    to name/call the given function.
  - Onnx implementation is the underlying onnx function which will be called.
  - Arguments are the arguments to the op which will be parsed by different
    macros depending on which file this is in.
  - Body arguments are just the names of the arguments so they can be used in
    the cpp file.
*/
#include "CompilerOperationMacros.inc.hpp"
#include "ManuallyAddedOperations.inc.hpp"

================================================
FILE: popart_compiler/include/popart_compiler/Utils.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef POPART_COMPILER_UTILS_HPP
#define POPART_COMPILER_UTILS_HPP

// NOTE(review): the header names of the bare #include directives (and the
// template arguments further down) appear to have been lost in extraction.
#include
#include

namespace poptorch {
namespace popart_compiler {

bool ipuModelEnvironmentVariableIsEnabled();
bool ipuSmallModelEnvironmentVariableIsEnabled();
std::string getIpuModelVersion();
int getNumTilesPerIpu(const std::string &ipu_model_version);
std::uint64_t roundUpNumIPUs(std::uint64_t num_ipus);
bool waitIfIpuIsUnavailable();
bool waitForAWhile();

/** Returns the IPU version of the device if the system contains a device with
 * num_ipus, -1 if there is a device but the architecture is unknown, or 0 if
 * there is no device with num_ipus.
 *
 * Note: This function doesn't check if the devices are currently in use.
 */
std::int64_t ipuHardwareVersion(std::uint64_t num_ipus = 1);

// Converts a C++ string to a unique pointer of the string array; the purpose
// is to return a "string" without using the non ABI-compatible std::string
std::unique_ptr stringToUniquePtr(const std::string &str);

// Returns the dtype int corresponding to the onnx type string
int64_t dtypeIntFromOnnxStr(const char *onnx_type);

// Returns the Onnx datatype as string corresponding the dtype int used in Onnx
// and Popart ops which take an int64_t dtype argument, e.g. "randomnormal"
const char *onnxStrFromDtypeInt(int64_t dtype);

} // namespace popart_compiler
} // namespace poptorch

#endif // POPART_COMPILER_UTILS_HPP

================================================
FILE: popart_compiler/source/CodeletsCompilation.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include
#include
#include
#include
#include
#include

#include "popart_compiler/CodeletsCompilation.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace popart_compiler {
namespace {

// Inter-process exclusive read file lock.
//
// The constructor opens `path` read-only and blocks in flock(LOCK_EX); the
// destructor unlocks and closes the descriptor. Both failure modes of the
// constructor are reported through the ERROR macros.
class ExclusiveFileLock {
public:
  explicit ExclusiveFileLock(const std::string &path)
      : _fd(open(path.c_str(), O_RDONLY)) {
    ERROR_ON_MSG(_fd == -1, "Could not open file " << path);
    if (flock(_fd, LOCK_EX) == -1) {
      close(_fd);
      ERROR("Could not obtain an exclusive lock on file " << path);
    }
  }

  // The descriptor has a single owner: a copy would unlock and close it twice.
  ExclusiveFileLock(const ExclusiveFileLock &) = delete;
  ExclusiveFileLock &operator=(const ExclusiveFileLock &) = delete;

  ~ExclusiveFileLock() {
    flock(_fd, LOCK_UN);
    close(_fd);
  }

private:
  int _fd;
};

// Returns the commit hash of poplar (via popc --version).
//
// The hash is taken to be the first run of 10 to 32 lower-case alphanumeric
// characters found in the command output; an error is raised when popc cannot
// be started, its output cannot be read, or no such run is present.
std::string poplarVersion() {
  FILE *stream = popen("popc --version", "r");
  ERROR_ON_MSG(stream == NULL,
               "Unable to read Poplar version. Is Poplar SDK enabled?");

  std::string output;
  try {
    char buffer[1024];
    while (fgets(buffer, sizeof(buffer), stream) != NULL) {
      output += buffer;
    }
  } catch (const std::exception &e) {
    pclose(stream);
    ERROR(
        "Unable to read the output of 'popc --version'. Reason: " << e.what());
  }
  ERROR_ON_MSG(pclose(stream) == -1,
               "Unable to read the output of 'popc --version'. Reason: "
                   << strerror(errno));

  std::smatch match;
  std::regex regex("([a-z0-9]{10,32})");
  if (std::regex_search(output, match, regex)) {
    return match.str();
  }
  ERROR("Unable to parse the output of 'popc --version'.");
}

// Computes a hash of the contents of a file at the specified path.
//
// The whole file is read into memory and hashed with std::hash; failure to
// open the file, or an exception while reading it, is reported as an error.
std::size_t getFileContentHash(const std::string &path) {
  std::ifstream file;
  file.open(path);
  ERROR_ON_MSG(!file.is_open(), "Could not open file " << path);

  try {
    file.seekg(0, std::ios::end);
    size_t size = file.tellg();
    std::string buffer(size, '\0');
    file.seekg(0);
    file.read(&buffer[0], size);
    return std::hash()(buffer);
  } catch (const std::exception &e) {
    ERROR("Could not read file " << path << ". Reason: " << e.what());
  }
}

// Final path is of form:
//   (source path without '.inc.cpp')-(source content hash)-(poplar version).gp
std::string compiledCodeletPath(const std::string &src_file_path) {
  std::size_t src_hash = getFileContentHash(src_file_path);
  std::string poplar_version = poplarVersion();

  // Remove the '.inc.cpp' file extension.
  std::string out_file_path = src_file_path.substr(0, src_file_path.size() - 8);
  out_file_path += "-";
  out_file_path += std::to_string(src_hash);
  out_file_path += "-";
  out_file_path += poplar_version;
  out_file_path += ".gp";
  return out_file_path;
}

// Runs `popc -target <target> -O3 <src_file_path> -o <out_file_path>` in a
// forked child and waits for it to finish.
//
// The child's stdout and stderr are redirected into a pipe. Returns normally
// when popc exits with status 0. Otherwise an error is raised describing the
// exit code (together with the captured popc output) or the terminating
// signal.
void compileCodelet(const std::string &src_file_path,
                    const std::string &out_file_path,
                    const std::string &target) {
  int pipe_fd[2];
  ERROR_ON_MSG(pipe(pipe_fd) == -1, "Could not compile codelet "
                                        << src_file_path
                                        << ", pipe failed. Reason: "
                                        << strerror(errno));

  pid_t child_pid = fork();
  if (child_pid == -1) {
    // Release both pipe ends before reporting; close() may overwrite errno,
    // so keep the value set by fork().
    const int fork_errno = errno;
    close(pipe_fd[0]);
    close(pipe_fd[1]);
    ERROR("Could not compile codelet " << src_file_path
                                       << ", fork failed. Reason: "
                                       << strerror(fork_errno));
  }

  if (child_pid == 0) {
    // No reason to ERROR_ON_MSG as we can't see stdout/stderr at this point.
    ERROR_ON(close(pipe_fd[0]) == -1);
    ERROR_ON(setpgid(0, 0) == -1);

    // Pipe stdout and stderr to the parent process.
    ERROR_ON(dup2(pipe_fd[1], STDOUT_FILENO) == -1);
    ERROR_ON(dup2(pipe_fd[1], STDERR_FILENO) == -1);
    ERROR_ON_MSG(close(pipe_fd[1]) == -1,
                 "Could not compile codelet "
                     << src_file_path
                     << ", closing child write pipe failed. Reason: "
                     << strerror(errno));

    char *const argv[] = {const_cast("popc"),
                          const_cast("-target"),
                          const_cast(target.c_str()),
                          const_cast("-O3"),
                          const_cast(src_file_path.c_str()),
                          const_cast("-o"),
                          const_cast(out_file_path.c_str()),
                          NULL};

    // getenv() returns a null pointer when PATH is unset; building a
    // std::string from it would be undefined behaviour.
    const char *path_value = std::getenv("PATH");
    std::string path_env_var =
        "PATH=" + std::string(path_value != nullptr ? path_value : "");
    char *const env[] = {const_cast(path_env_var.c_str()), NULL};
    execvpe("popc", argv, env);

    // 'exec' only returns on failure.
    _exit(EXIT_FAILURE);
  } else {
    // Close the write end.
    ERROR_ON_MSG(close(pipe_fd[1]) == -1,
                 "Could not compile codelet "
                     << src_file_path
                     << ", closing parent write pipe failed. Reason: "
                     << strerror(errno));

    // Drain the child's output *before* waiting for it: a pipe has a finite
    // capacity, so a child that prints more than that would block in write()
    // forever if nobody was reading, and waitpid() would never return.
    std::string output;
    char buffer[1024];
    while (true) {
      const ssize_t num_read = read(pipe_fd[0], buffer, sizeof(buffer));
      if (num_read > 0) {
        output.append(buffer, static_cast<std::size_t>(num_read));
      } else if (num_read == -1 && errno == EINTR) {
        continue; // Interrupted by a signal: retry.
      } else {
        // End of stream, or a read error: the output is only used for
        // diagnostics, so report whatever was collected.
        break;
      }
    }

    int status;
    if (waitpid(child_pid, &status, 0) == -1) {
      const int wait_errno = errno;
      close(pipe_fd[0]);
      ERROR("Could not compile codelet "
            << src_file_path
            << ", waiting for child process failed. Reason: "
            << strerror(wait_errno));
    }

    // The read end is closed exactly once, here, whatever the outcome.
    const bool close_failed = close(pipe_fd[0]) == -1;
    const int close_errno = errno;

    // Return on success and report errors on failures.
    if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
      // Child exited successfully.
      ERROR_ON_MSG(close_failed,
                   "Could not compile codelet "
                       << src_file_path
                       << ", closing parent read pipe failed. Reason: "
                       << strerror(close_errno));
      return;
    }

    std::string exit_reason;
    if (WIFEXITED(status)) {
      // Child exited with non-zero code.
      exit_reason = "child failed with exit code ";
      exit_reason += std::to_string(WEXITSTATUS(status));
      exit_reason += ".";

      // Let the user know what the child wrote to stdout and stderr.
      exit_reason += " 'popc' output was:\n";
      exit_reason += output;
    } else if (WIFSIGNALED(status)) {
      // Child killed by a signal.
      exit_reason = "child killed with a signal ";
      exit_reason += std::to_string(WTERMSIG(status));
      exit_reason += " (";
      exit_reason += strsignal(WTERMSIG(status));
      exit_reason += ").";
    } else {
      exit_reason = "child failure unknown.";
    }
    ERROR("Could not compile codelet " << src_file_path << ", " << exit_reason);
  }
}

// True filesystem python package path where codelet sources are stored.
// It gets initialized on first 'import poptorch' from python.
std::string custom_codelets_path;

} // namespace

// Records the directory holding the codelet sources. Only the first call has
// an effect: once a non-empty path is stored, later calls are ignored.
void setCustomCodeletsPath(const char *cache_path) {
  if (custom_codelets_path.empty()) {
    custom_codelets_path = cache_path;
  }
}

// Compiles the codelet `src_file_name` (relative to the custom codelets path)
// unless a compiled file for the same source hash and poplar version already
// exists, and returns the path of the compiled file.
//
// The source file is locked for the duration of the call so that only one
// process performs the compilation. The popc target is:
//   - "ipu<version>" when hw_only_codelet is set (an error is raised if no
//     hardware version can be determined),
//   - "cpu,<ipu model version>" when no hardware version can be determined,
//   - "cpu,ipu<version>" otherwise.
std::unique_ptr compileCustomCodeletIfNeeded(const char *src_file_name,
                                             bool hw_only_codelet) {
  logging::LogContext ctx("CompileCustomCodeletIfNeeded");
  logging::debug("Inspecting whether custom codelet {} needs to be compiled",
                 src_file_name);

  // Should never happen.
  ERROR_ON(custom_codelets_path.empty());

  std::string src_file_path = custom_codelets_path;
  src_file_path += "/";
  src_file_path += src_file_name;

  // Lock the src file to make sure only a single process does the compilation.
  ExclusiveFileLock lock(src_file_path);

  std::string out_file_path = compiledCodeletPath(src_file_path);

  // Skip compilation if codelet is already compiled.
  std::ifstream out_file;
  out_file.open(out_file_path);
  if (out_file.is_open()) {
    logging::debug("Custom codelet {} already compiled", src_file_name);
    out_file.close();
    return stringToUniquePtr(out_file_path);
  }

  std::string target;
  std::int64_t hw_version = ipuHardwareVersion();
  if (hw_only_codelet) {
    ERROR_ON_MSG(
        hw_version == 0 || hw_version == -1,
        "Can't infer IPU hardware version, are there any IPUs in the system?");
    target = "ipu" + std::to_string(hw_version);
  } else if (hw_version == 0 || hw_version == -1) {
    target = "cpu," + getIpuModelVersion();
  } else {
    target = "cpu,ipu" + std::to_string(hw_version);
  }

  logging::debug("Compiling custom codelet {} for target {}", src_file_name,
                 target);
  compileCodelet(src_file_path, out_file_path, target);
  return stringToUniquePtr(out_file_path);
}

} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/source/Compiler.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
// NOTE(review): the header names of the bare #include directives below appear
// to have been lost when this file was extracted - restore them from the
// upstream source before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/CustomOps.hpp"
#include "popart_compiler/MultiConvBuilder.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/SessionOptionsImpl.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_err/ExceptionInfo.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace popart_compiler {
namespace {

// Writes the builder's model proto to `export_proto_filename`; does nothing
// when the file name is empty.
//
// The proto is written as human readable text unless the environment variable
// POPTORCH_EXPORT_PROTO_AS_BINARY is set to a non-zero integer.
//
// NOTE(review): the value is parsed with std::stoi, which throws for
// non-numeric input, and the stream is not checked for a failed open.
void saveModelProtoIfNeeded(popart::Builder *builder,
                            const char *export_proto_filename) {
  const std::string filename = export_proto_filename;
  if (!filename.empty()) {
    // Important: popart_compiler is compiled using C++ 14 and therefore
    // doesn't have access to the filesystem utilities so the caller is
    // responsible for making sure the directories exist and the
    // filename is a valid filename.
    std::ofstream fs(filename);
    bool human_readable = true;
    if (const char *proto_as_bin =
            std::getenv("POPTORCH_EXPORT_PROTO_AS_BINARY")) {
      // Any value parsing to 0 keeps the text format.
      human_readable = std::stoi(proto_as_bin) == 0;
    }
    if (human_readable) {
      logging::info("Exporting model proto as text (Set "
                    "POPTORCH_EXPORT_PROTO_AS_BINARY=1 to export as binary)");
    } else {
      logging::info("Exporting model proto as binary (Set "
                    "POPTORCH_EXPORT_PROTO_AS_BINARY=0 to export as human "
                    "readable text)");
    }
    fs << builder->getModelProto(human_readable);
    fs.close();
  }
}

// Helper to let us filter string arguments into const char*s. This is to catch
// the std::string produced by some attributes before they cross the ABI
// boundary.
template T convertType(T &&t) { return std::forward(t); } std::vector convertType(std::vector v) { return std::vector(v.begin(), v.end()); } // Convert an overlap string to a PopART TileSet and Exchange Strategy std::pair exchangeStrToPopartEnum(const char *overlap) { std::pair tile_set_and_strat( popart::TileSet::Compute, popart::ExchangeStrategy::JustInTime); if (strcmp(overlap, "overlap_accumulation_loop") == 0) { tile_set_and_strat.first = popart::TileSet::IO; tile_set_and_strat.second = popart::ExchangeStrategy::OverlapInnerLoop; } else if (strcmp(overlap, "overlap_device_iteration_loop") == 0) { tile_set_and_strat.first = popart::TileSet::IO; tile_set_and_strat.second = popart::ExchangeStrategy::OverlapLoops; } else { ERROR_ON(strcmp(overlap, "no_overlap") != 0); } return tile_set_and_strat; } // Variadic output case. For now we will add all outputs to the graph and // allocate them on the same IPU but we will only return one. This means only // one output can be used by user IR (but can still be used by the backed via // transformations). template struct HandleOutput { TensorId operator()(T &in, bool loss, detail::CompilerImpl *_impl) { ERROR_ON_MSG(loss, "Unreachable internal error: no operation with multiple " "returns is expected to be a loss."); std::set ids; for (const popart::TensorId &id : in) { ids.insert(id); _impl->ids.push_back(id); } _impl->setExecutionStrategyAttributes(ids); // Return the first added tensor as the sole return of this IR op. return _impl->ids.size() - in.size(); } }; // Single tensor output case template <> struct HandleOutput { TensorId operator()(const popart::TensorId &in, bool loss, detail::CompilerImpl *_impl) { // See if any available memory has been set for this IPU. auto itr = _impl->options.available_memory_proportion.find(_impl->active_ipu); if (itr != _impl->options.available_memory_proportion.end()) { logging::info("Setting memory proportion on tensor {} to {}. 
On IPU {}", in, itr->second, itr->first); _impl->active_builder->setAvailableMemoryProportion(in, itr->second); } _impl->ids.push_back(in); if (!_impl->active_builder->nodeHasAttribute( popart::sPipelineStageAttribute, {in}) && !_impl->active_builder->nodeHasAttribute( popart::sExecutionPhaseAttribute, {in})) { _impl->setExecutionStrategyAttributes({in}); } if (loss) { _impl->loss = in; } return _impl->ids.size() - 1; } }; // Host side constant case template <> struct HandleOutput { TensorId operator()(TensorId in, bool loss, detail::CompilerImpl *_impl) { UNUSED(loss); ERROR_ON(!_impl->isHostSideConstant(in)); return in; } }; // A whitelist of supported loss operations. Popart needs to know which // operations are losses so they can be marked by the session. bool IsLoss(const std::string &operation) { return operation == "popart::identityloss"; } } // namespace void copyParam(Optimizer &dest_optim, const Optimizer &source_optim, const char *source, const char *dest) { const float *source_float = nullptr; const bool *source_is_const = nullptr; float *dest_float = nullptr; bool *dest_is_const = nullptr; for (const auto ¶m : source_optim.parameters) { const char *param_name = static_cast(param.name); if (strcmp(param_name, source) == 0) { source_float = ¶m.value; source_is_const = ¶m.is_const; } } for (auto ¶m : dest_optim.parameters) { const char *param_name = static_cast(param.name); if (strcmp(param_name, dest) == 0) { dest_float = ¶m.value; dest_is_const = ¶m.is_const; } } if ((source_float != nullptr) && (dest_float != nullptr)) { ERROR_ON(!source_is_const); ERROR_ON(!dest_is_const); logging::debug("Set {} ({}) to {} ({})", dest, *dest_float, source, *source_float); (*dest_float) = (*source_float); (*dest_is_const) = (*source_is_const); } } PopartAttribute::PopartAttribute(const char *name, const int64_t &value) : _name(stringToUniquePtr(name)), _any(new popart::any(value)) {} PopartAttribute::PopartAttribute(const char *name, const std::vector &values) : 
_name(stringToUniquePtr(name)), _any(new popart::any(values)) {} PopartAttribute::PopartAttribute(const char *name, const float &value) : _name(stringToUniquePtr(name)), _any(new popart::any(value)) {} PopartAttribute::PopartAttribute(const char *name, const std::vector &values) : _name(stringToUniquePtr(name)), _any(new popart::any(values)) {} PopartAttribute::PopartAttribute(const char *name, const std::unique_ptr &str) : _name(stringToUniquePtr(name)), _any(new popart::any(std::string(str.get()))) {} PopartAttribute::PopartAttribute( const char *name, const std::vector> &strs) : _name(stringToUniquePtr(name)) { std::vector strs_new; strs_new.reserve(strs.size()); for (const auto &str : strs) { strs_new.emplace_back(str.get()); } _any = std::make_unique(std::move(strs_new)); } PopartAttribute::PopartAttribute(PopartAttribute &&) = default; PopartAttribute &PopartAttribute::operator=(PopartAttribute &&) = default; PopartAttribute::~PopartAttribute() = default; popart::any *PopartAttribute::getValue() { return _any.get(); } PopartConstant::PopartConstant(const PopartType &popart_type, const void *data, const std::vector &shape) { ERROR_ON_MSG(popart_type == PopartType::DOUBLE, "Adding a double constant is not supported. " "This should have been demoted to a float"); const popart::TensorInfo info{toPopartTypeStr(popart_type), shape}; _data = std::make_unique(data, info); } PopartConstant::~PopartConstant() = default; HostSideConstant::HostSideConstant(const PopartType &popart_type, void *data, size_t data_size, std::vector shape) : _popart_type(popart_type), _shape(std::move(shape)) { _data.resize(data_size); std::memcpy(_data.data(), data, data_size); } void HostSideConstant::copyDataTo(void *ptr) const { std::memcpy(ptr, _data.data(), _data.size()); } TensorId Compiler::addInputTensor(const char *type, const std::vector &dims, const char *overlap) { // Create the tensor info for our new tensor. 
const popart::TensorInfo info{type, dims}; popart::InputSettings settings; const auto tile_set_and_strat = exchangeStrToPopartEnum(overlap); if (tile_set_and_strat.second != popart::ExchangeStrategy::JustInTime) { _impl->using_overlapped_io = true; } settings.setTileSet(tile_set_and_strat.first); settings.setExchangeStrategy(tile_set_and_strat.second); const auto popart_id = _impl->active_builder->addInputTensor(info, settings); _impl->inputs.push_back(popart_id); _impl->ids.push_back(popart_id); return _impl->ids.size() - 1; } TensorId Compiler::createTensorId(const char *name) { const popart::TensorId tensor(name); _impl->ids.push_back(tensor); return _impl->ids.size() - 1; } #define INT_VEC std::vector #define FLOAT_VEC std::vector #define FLOAT float #define INT std::int64_t #define BOOL bool #define DEBUG_CONTEXT(Name) _impl->getDebugContext(Name) #define CHAR char #define STRING const char * #define STRING_VEC std::vector #define NONE #define ARG(Type, Name) , Type Name #define POPART_CONST_ARG(Name) , const PopartConstant &Name #define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant &Name #define POPART_ATTRIB_VEC_ARG(Name) \ , std::shared_ptr> Name #define BODY_ARG(Name) , convertType(Name) // Create a function decl with the given call and arguments. #define OP_DECL(ns, funcName, function, onnxImpl, Args, BodyArgs) \ TensorId Compiler::function(const std::vector &inputs Args) { \ auto AiOnnxOpset11 = _impl->active_builder->aiOnnxOpset11(); \ auto AiGraphcoreOpset1 = _impl->active_builder->aiGraphcoreOpset1(); \ const bool isLoss = IsLoss(#ns "::" #funcName); \ std::vector ins; \ std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins), \ [&](TensorId index) { return _impl->ids[index]; }); \ auto output = onnxImpl(ins BodyArgs); \ return HandleOutput{}(output, isLoss, _impl.get()); \ } // Create a function decl with the given call and arguments. 
#define OP_DECL_NO_RETURN(ns, funcName, function, onnxImpl, Args, BodyArgs) \ void Compiler::function(const std::vector &inputs Args) { \ auto AiOnnxOpset11 = _impl->active_builder->aiOnnxOpset11(); \ auto AiGraphcoreOpset1 = _impl->active_builder->aiGraphcoreOpset1(); \ std::vector ins; \ std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins), \ [&](TensorId index) { return _impl->ids[index]; }); \ onnxImpl(ins BodyArgs); \ } #include "popart_compiler/SupportedOperations.inc.hpp" #undef OP_DECL #undef OP_DECL_NO_RETURN #undef BODY_ARG #undef POPART_ATTRIB_VEC_ARG #undef POPART_CONST_ARG #undef HOST_SIDE_CONST_ARG #undef ARG #undef NONE #undef STRING_VEC #undef CHAR #undef STRING #undef BOOL #undef INT #undef FLOAT #undef FLOAT_VEC #undef INT_VEC #undef DEBUG_CONTEXT TensorId Compiler::addInitializedInputTensor(const char *name, const char *type, const std::vector &dims, void *data) { // Create the tensor info for our new tensor. const popart::TensorInfo info{type, dims}; // Create the inital data for the variable. const popart::ConstVoidData the_data{data, info}; _impl->ids.push_back( _impl->active_builder->addInitializedInputTensor(the_data, name)); const popart::TensorId &id = _impl->ids[_impl->ids.size() - 1]; _impl->weights.registerParameter(id, info); return _impl->ids.size() - 1; } TensorId Compiler::addInitializedInputTensor( const char *name, const char *type, const std::vector &dims, void *data, int comm_group_type, int shards, int variable_retrieval_mode) { // Create the tensor info for our new tensor. const popart::TensorInfo info{type, dims}; // Create the inital data for the variable. 
const popart::ConstVoidData the_data{data, info}; const popart::VariableSettings settings( popart::CommGroup(popart::CommGroupType(comm_group_type), shards), popart::VariableRetrievalMode(variable_retrieval_mode)); _impl->ids.push_back(_impl->active_builder->addInitializedInputTensor( the_data, settings, name)); const popart::TensorId &id = _impl->ids[_impl->ids.size() - 1]; _impl->weights.registerParameter(id, info); return _impl->ids.size() - 1; } void Compiler::addOutputTensor(TensorId output, PopartOutputMode output_mode, size_t output_return_period, const char *overlap) { _impl->outputs.push_back(_impl->ids[output]); if (isHostSideConstant(output)) { return; // Nothing more to do } if (output_mode == PopartOutputMode::N) { output_mode = _impl->options.output_mode; if (output_mode == PopartOutputMode::EveryN) { output_return_period = _impl->options.output_return_period; } } const auto tile_set_and_strat = exchangeStrToPopartEnum(overlap); if (tile_set_and_strat.second != popart::ExchangeStrategy::JustInTime) { _impl->using_overlapped_io = true; } // Check for any use of overlapped io // NB this relies on the fact that manual anchors never overlap and other // outputs all have the same output_mode. If these assumptions change, // the logic will have to make sure _impl->using_overlapped_io is correct // before any call to this function rather than changed to true on the first // instance. if (_impl->using_overlapped_io) { verifySettingsForOverlappedIO(output_mode); } const char *as_str = outputModeToString(output_mode); // If we are returning EveryN we need to pass in the return period. 
if (output_mode == PopartOutputMode::EveryN) { _impl->anchors.insert({_impl->ids[output], popart::AnchorReturnType( as_str, output_return_period, tile_set_and_strat.first, tile_set_and_strat.second)}); } else { _impl->anchors.insert( {_impl->ids[output], popart::AnchorReturnType(as_str, tile_set_and_strat.first, tile_set_and_strat.second)}); } } template static void setUpInputImpl(TensorId id, T *ptr, const std::vector &dims, detail::CompilerImpl *impl) { // Popart wrapper around the tensor pointer. impl->memory_manager.push_back( std::make_unique>(ptr, dims)); impl->popart_incoming.insert( {impl->ids[id], *impl->memory_manager.back().get()}); } void Compiler::setUpInputOp(TensorId id, float *ptr, const std::vector &dims) { assertTensorIs(PopartType::FLOAT, id); setUpInputImpl(id, ptr, dims, _impl.get()); } void Compiler::setUpInputOp(TensorId id, std::int32_t *ptr, const std::vector &dims) { assertTensorIs(PopartType::INT32, id); setUpInputImpl(id, ptr, dims, _impl.get()); } void Compiler::setUpInputOp(TensorId id, bool *ptr, const std::vector &dims) { assertTensorIs(PopartType::BOOL, id); setUpInputImpl(id, ptr, dims, _impl.get()); } void Compiler::setUpInputOp(TensorId id, std::int8_t *ptr, const std::vector &dims) { assertTensorIs(PopartType::INT8, id); setUpInputImpl(id, ptr, dims, _impl.get()); } void Compiler::setUpInputOp(TensorId id, std::uint8_t *ptr, const std::vector &dims) { assertTensorIs(PopartType::UINT8, id); setUpInputImpl(id, ptr, dims, _impl.get()); } void Compiler::setUpInputOp(TensorId id, std::int16_t *ptr, const std::vector &dims, bool float16) { if (float16) { assertTensorIs(PopartType::FLOAT16, id); } else { assertTensorIs(PopartType::INT16, id); } // Popart wrapper around the tensor pointer. _impl->memory_manager.push_back( std::make_unique>( ptr, popart::TensorInfo(float16 ? 
popart::DataType::FLOAT16 : popart::DataType::INT16, dims))); _impl->popart_incoming.insert( {_impl->ids[id], *_impl->memory_manager.back().get()}); } template static void addOutput(TensorId id, T *ptr, const std::vector &dims, detail::CompilerImpl *impl) { // Popart wrapper around the tensor pointer. auto memory = std::make_unique>(static_cast(ptr), dims); impl->addMemoryToOutput(id, ptr, std::move(memory)); } void Compiler::setUpOutputOp(TensorId id, std::uint8_t *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::setUpOutputOp(TensorId id, std::int8_t *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::setUpOutputOp(TensorId id, float *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::setUpOutputOp(TensorId id, std::int32_t *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::setUpOutputOp(TensorId id, bool *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::setUpOutputOp(TensorId id, std::int16_t *ptr, const std::vector &dims) { addOutput(id, ptr, dims, _impl.get()); } void Compiler::initSession(const std::vector &optimizers, const char *export_proto_filename) { const logging::LogContext ctx_init_session{"Compiler::initSession"}; logging::trace("Initializing session"); // Some simple PyTorch models will not need an IPU at all. However, we do not // want users to experience error messages as these may be trivial models // which users try in their first use of PopTorch. if (_impl->used_ipus.empty()) { logging::info("No IPUs are used by this model. 
This may happen if the " "model is trivial"); return; } const auto device = _impl->createDevice(); popart::SessionOptions &options = _impl->popart_options; if (options.engineOptions.count("debug.retainDebugInformation") == 0) { options.engineOptions.emplace("debug.retainDebugInformation", "false"); // Message has to be consistent with format used by setOptionIfNotSet() logging::debug( "engineOptions[debug.retainDebugInformation] set to value false"); } // 'Auto' mode works if only one IPU is used per replica, and allows // overlapped IO to work. Excerpt from D51863 in PopART: // IO tiles can only be used when virtual graphs are enabled. Virtual graph // modes enable to assign tensors and operations to a subset of IPUs, and // within each IPU, to a subset of tiles (such as compute and IO tiles). The // supported modes are one of: {Manual, Auto, ExecutionPhases}. popart::VirtualGraphMode graph_mode = popart::VirtualGraphMode::Auto; // If Pipelining wasn't set: enable it if more than 1 IPU is used. switch (_impl->options.execution_mode) { case detail::ExecutionMode::Pipelined: { _impl->setOptionIfNotSet(options.enablePipelining, _impl->used_ipus.size() > 1, "enablePipelining"); // If we are pipelining we want to turn on recompute by default. if (_impl->used_ipus.size() > 1) { graph_mode = popart::VirtualGraphMode::Manual; _impl->setOptionIfNotSet( options.autoRecomputation, popart::RecomputationType::Pipeline, "autoRecomputation", popart::toString(popart::RecomputationType::Pipeline)); } // TODO(T53152): AccumulateOuterFragmentSchedule::Serial is currently // incompatible with gradient clipping and pipelining. 
for (const auto &optimizer : optimizers) { if (optimizer.max_grad_norm != std::numeric_limits::infinity()) { _impl->setOptionIfNotSet( options.accumulateOuterFragmentSettings.schedule, popart::AccumulateOuterFragmentSchedule::Scheduler, "accumulateOuterFragmentSettings.schedule", "AccumulateOuterFragmentSchedule::Scheduler"); break; } } break; } case detail::ExecutionMode::Sharded: { _impl->setOptionIfNotSet(options.enablePipelining, false, "enablePipelining"); if (_impl->used_ipus.size() > 1 || _impl->using_overlapped_io) { graph_mode = popart::VirtualGraphMode::Manual; } break; } case detail::ExecutionMode::Phased: { _impl->setOptionIfNotSet(options.enablePipelining, false, "enablePipelining"); graph_mode = popart::VirtualGraphMode::ExecutionPhases; std::uint64_t num_phases = _impl->max_phase + 1; const std::uint64_t num_stages = _impl->options.serial_phases_execution ? 1 : 2; if (_impl->options.tensors_liveness != detail::Liveness::AlwaysLive) { // We want to send the tensors off chip: Tensors stay live through // phases N, N+1, N+2 so we need to have a gap of 3 before the bwd // pass, otherwise the bwd pass will start in the same phase as the // end of the fwd pass. num_phases += 3; } else if (_impl->options.separate_backward_phase) { // Make sure the backward pass will start with a new phase. 
num_phases += 1; } _impl->setOptionIfNotSet(options.executionPhaseSettings.phases, num_phases, "executionPhaseSettings.phases"); _impl->setOptionIfNotSet(options.executionPhaseSettings.stages, num_stages, "executionPhaseSettings.stages"); _impl->setOptionIfNotSet( options.activationTensorLocationSettings.location.storage, popart::TensorStorage::OffChip, "location_activation", "useOnChipStorage(False)"); _impl->setOptionIfNotSet( options.weightTensorLocationSettings.location.storage, popart::TensorStorage::OffChip, "location_weight", "useOnChipStorage(False)"); _impl->setOptionIfNotSet( options.optimizerStateTensorLocationSettings.location.storage, popart::TensorStorage::OffChip, "location_optimizer", "useOnChipStorage(False)"); _impl->setOptionIfNotSet( options.accumulatorTensorLocationSettings.location.storage, popart::TensorStorage::OffChip, "location_accumulator", "useOnChipStorage(False)"); break; } default: ERROR("ExecutionMode not supported"); } // By default allow the user to save / restore the RNG state (It uses slightly // more memory). 
_impl->setOptionIfNotSet(options.enableLoadAndOffloadRNGState, true, "enableLoadAndOffloadRNGState"); _impl->setOptionIfNotSet(options.virtualGraphMode, graph_mode, "virtualGraphMode", popart::toString(graph_mode)); _impl->setOptionIfNotSet(options.enableDistributedReplicatedGraphs, _impl->options.num_distributed_processes > 1, "enableDistributedReplicatedGraphs"); _impl->setOptionIfNotSet(options.globalReplicationFactor, _impl->options.num_distributed_processes * options.replicatedGraphCount, "globalReplicationFactor"); _impl->setOptionIfNotSet(options.globalReplicaOffset, _impl->options.distributed_process_id * options.replicatedGraphCount, "globalReplicaOffset"); _impl->setOptionIfNotSet(options.enableReplicatedGraphs, options.replicatedGraphCount > 1, "enableReplicatedGraphs"); // Disable constant_weights by default: causes problems with Popart _impl->setOptionIfNotSet(options.constantWeights, false, "constantWeights"); if (_impl->options.execution_mode == detail::ExecutionMode::Pipelined) { const auto num_pipeline_stages = _impl->numPipelineStages(); if (_impl->is_training) { const auto num_forward_stages = (num_pipeline_stages + 1) / 2; const auto num_backward_stages = (num_pipeline_stages - 1) / 2; const std::string err_msg = fmt::format( "poptorch.Options().Training.gradientAccumulation must be greater " "than or equal to the number of pipeline stages ({}) when using " "poptorch.PipelinedExecution. 
Please note that a model with {} " "pipeline stages in PopTorch will have an additional {} stages when " "training.", num_pipeline_stages, num_forward_stages, num_backward_stages); ERROR_ON_MSG(_impl->popart_options.accumulationFactor < static_cast(num_pipeline_stages), err_msg); } else { const std::string err_msg = fmt::format("poptorch.Options().deviceIterations must be greater " "than or equal to the number of pipeline stages ({}) " "when using PopTorch.PipelinedExecution.", num_pipeline_stages); ERROR_ON_MSG(_impl->options.steps < num_pipeline_stages, err_msg); } } _impl->setOptionIfNotSet(options.enableGradientAccumulation, options.accumulationFactor > 1, "enableGradientAccumulation"); // Only explicitly set these options if overlapped I/O are used // otherwise we might be overwriting the values set implicitly // by some other PopART options (like for example enableExplicitIR()). if (_impl->using_overlapped_io) { // This is needed for both overlapped IO and explicit pipelining (not yet) // supported. _impl->setOptionIfNotSet(options.useHostCopyOps, _impl->using_overlapped_io, "useHostCopyOps"); // This is needed but may cause regressions for existing models. When it is // more developed, this will become the default. _impl->setOptionIfNotSet(options.enableExplicitMainLoops, _impl->using_overlapped_io, "enableExplicitMainLoops"); } // Create the anchors, these are used to copy to the host. const auto data_flow = popart::DataFlow(_impl->options.steps, _impl->anchors); // Save the initializers to an external file if requested. 
if (!_impl->options.external_initializers_file.empty()) { const logging::LogContext ctx{ "popart::Builder::saveInitializersExternally"}; logging::trace("Saving initializers to external file {}", _impl->options.external_initializers_file); _impl->active_builder->saveInitializersExternally( _impl->weights.parameterIds(), _impl->options.external_initializers_file); } const auto model_name_set = _impl->options_set.count("model_name") > 0; // Tensor location in PopART includes a shardingDomain option which sets // which replicas to shard tensors across when using replicated tensor // sharding. For now, only one option works for multiple processes, which is // to set the type to consecutive across the number of local replica (which // is equal to options.replicatedGraphCount on each process). // // The setting for a single process remains the default (All) which shards // tensors across all replica. // // In future, GCL and PopART will support additional options, which can be // exposed to the user. if (_impl->options.num_distributed_processes > 1) { const popart::CommGroup sharding_domain(popart::CommGroupType::Consecutive, options.replicatedGraphCount); options.activationTensorLocationSettings.location.shardingDomain = sharding_domain; options.weightTensorLocationSettings.location.shardingDomain = sharding_domain; options.optimizerStateTensorLocationSettings.location.shardingDomain = sharding_domain; options.accumulatorTensorLocationSettings.location.shardingDomain = sharding_domain; } saveModelProtoIfNeeded(_impl->active_builder, export_proto_filename); // Create the popart session object to actually run the graph. if (!_impl->is_training) { // Create an inference session. const logging::LogContext ctx{ "popart::InferenceSession::createFromOnnxModel"}; _impl->session = popart::InferenceSession::createFromOnnxModel( _impl->active_builder->getModelProto(), data_flow, device, {}, options, popart::PatternsLevel::Default, model_name_set ? 
_impl->options.model_name : "inference"); } else { // Create the optimizer from user provided parameters. const std::unique_ptr optimizer = _impl->getPopartOptimizer(optimizers); // Create the training session. const logging::LogContext ctx{ "popart::TrainingSession::createFromOnnxModel"}; _impl->session = popart::TrainingSession::createFromOnnxModel( _impl->active_builder->getModelProto(), data_flow, _impl->loss, *optimizer, device, {}, options, _impl->options.patterns, model_name_set ? _impl->options.model_name : "training"); } } void Compiler::saveExecutableToFile(const char *export_filename) const { ERROR_ON_MSG(!_impl->session, "Nothing to export. This may be because the model does not run " "any op on the IPU."); const logging::LogContext ctx_function{"Compiler::saveExecutableToFile"}; const logging::LogContext ctx{"popart::Session::saveExecutable"}; _impl->session->saveExecutable(export_filename); } void Compiler::setRngState(std::uint64_t seed, const std::vector &rng_state) { ERROR_ON_MSG(!_impl->session, "Session should be initialised first"); logging::debug("Setting random seed to: {}", seed); if (_impl->session->getIr().getRequiresRandomSeed()) { _impl->session->setRandomSeed(seed); } else { logging::debug("Session has no random behaviour: nothing to do."); } if (!rng_state.empty()) { logging::debug("Setting RNG state"); _impl->session->setRNGState(rng_state); } } std::vector Compiler::getRngState() const { ERROR_ON_MSG(!_impl->session, "Session should be initialised first"); logging::debug("Reading RNG state"); return _impl->session->getRNGState(); } std::uint64_t Compiler::getRandomSeed() const { ERROR_ON_MSG(!_impl->session, "Session should be initialised first"); logging::debug("Reading random seed"); if (_impl->session->getIr().getRequiresRandomSeed()) { return _impl->session->getRandomSeed(); } logging::debug("Session has no random behaviour: using 0 as seed."); return 0; } void Compiler::loadExecutableAndPrepareDevice(const char *import_filename) 
{ ERROR_ON_MSG(!_impl->session, "Nothing to import. This may be because the " "model does not run any op on an IPU."); const logging::LogContext ctx{"Compiler::loadExecutableAndPrepareDevice"}; const std::string path(import_filename); auto stream = std::make_shared(path, std::ifstream::binary); ERROR_ON_MSG(!stream->is_open(), "Failed to open " << path << " for reading"); _impl->session->loadExecutableFromStream(stream); // Don't automatically load the engine: we want to control when this happens // to make sure it happens at the same time in distributed environments. constexpr bool load_engine = false; _impl->session->prepareDevice(load_engine); _impl->cachePopartTypes(); } void Compiler::loadEngineAndConnectStreams() { if (!_impl->session) { logging::trace("Skipping loading engine"); return; } logging::trace("Loading engine"); _impl->session->loadEngineAndConnectStreams(); static const std::map, std::uint8_t, std::less> host_sizes{// word types {poplar::UNSIGNED_INT, 4}, {poplar::INT, 4}, {poplar::FLOAT, 4}, // half types {poplar::UNSIGNED_SHORT, 2}, {poplar::SHORT, 2}, {poplar::HALF, 2}, // byte types {poplar::BOOL, 1}, {poplar::CHAR, 1}, {poplar::SIGNED_CHAR, 1}, {poplar::UNSIGNED_CHAR, 1}}; // For each individual CPU operation (multiple calls to one op = still one op) for (detail::CallbackInternalMetadata &cb_data : _impl->callbacks) { // For each input we create a special callback which tracks how many inputs // have been added and once they're all in it calls back into python. 
const auto to_size_bytes = [&](const auto &shape, const auto &type) { const poplar::Type ptype = poptorch::popart_compiler::poplarTypeFromPoptorch(type); const auto it = host_sizes.find(ptype); ERROR_ON_MSG(it == host_sizes.cend(), "Unsupported host op type"); const std::size_t number_of_elems = std::accumulate( shape.cbegin(), shape.cend(), 1, std::multiplies()); return number_of_elems * it->second; }; // Store the amount of data to be transferred for each of the function's // input and output arguments. std::vector input_sizes(cb_data.input_shapes.size()); std::transform(cb_data.input_shapes.cbegin(), cb_data.input_shapes.cend(), cb_data.input_types.begin(), input_sizes.begin(), to_size_bytes); std::vector output_sizes(cb_data.output_shapes.size()); std::transform(cb_data.output_shapes.cbegin(), cb_data.output_shapes.cend(), cb_data.output_types.begin(), output_sizes.begin(), to_size_bytes); const auto poplar_callback = [input_sizes = std::move(input_sizes), output_sizes = std::move(output_sizes), &cb_data](const void *const *inputs, size_t number_of_inputs, void *const *outputs, size_t number_of_outputs) { ERROR_ON_MSG(number_of_inputs != input_sizes.size(), "Number of inputs does not match"); ERROR_ON_MSG(number_of_outputs != output_sizes.size(), "Number of outputs does not match"); ERROR_ON_MSG(inputs == nullptr, "CPU function callback given null inputs"); ERROR_ON_MSG(outputs == nullptr, "CPU function callback given null outputs"); ERROR_ON_MSG(number_of_inputs != cb_data.input_pointers.size(), "Number of inputs does not match cb data (got " << cb_data.input_pointers.size() << ")"); ERROR_ON_MSG(number_of_outputs != cb_data.output_pointers.size(), "Number of outputs does not match cb data (got " << cb_data.output_pointers.size() << ")"); for (std::size_t input = 0; input < number_of_inputs; ++input) { // Copy from IPU into the waiting pytorch tensor on host. 
std::memcpy(reinterpret_cast(cb_data.input_pointers[input]), reinterpret_cast(inputs[input]), input_sizes[input]); } // Call the pytorch function on CPU. cb_data.the_callback(); // We then do the outputs, these are much simpler since it is a // straight up dependency free data copy. for (std::size_t output = 0; output < number_of_outputs; ++output) { std::memcpy( reinterpret_cast(outputs[output]), reinterpret_cast(cb_data.output_pointers[output]), output_sizes[output]); } }; // Tell poplar about the callback. _impl->session->connectHostFunction(cb_data.handle, std::move(poplar_callback)); } } void Compiler::appendPoptorchMetadataToFile( const char *serialized_poptorch_metadata, const size_t metadata_length, const char *export_filename) { popef::Reader reader; reader.parseFile(export_filename); ERROR_ON_MSG(reader.executables().size() != 1, "Popef file does not contain exactly one Executable blob."); const std::string &executable_name = reader.executables().at(0).name; popef::FileWriter writer(export_filename, popef::FileWriter::Mode::APPEND); auto poptorch_blob = writer.createOpaqueBlob(poptorch_opaque_name, executable_name); poptorch_blob->stream.write(serialized_poptorch_metadata, metadata_length); poptorch_blob->close(); writer.close(); } std::vector Compiler::importPoptorchMetadataFromFile(const char *import_filename) { popef::Reader reader; reader.parseFile(import_filename); std::vector opaques = reader.opaqueBlobs(); auto poptorch_blob_it = std::find_if( opaques.begin(), opaques.end(), [](const popef::OpaqueReader &opaque) { return opaque.name == poptorch_opaque_name; }); ERROR_ON_MSG(poptorch_blob_it == opaques.end(), "Popef file does not contain Poptorch metadata."); const size_t buffer_size = poptorch_blob_it->getAvailableReadSize(); std::vector metadata_buffer(buffer_size); poptorch_blob_it->data.read(metadata_buffer.data(), buffer_size); return metadata_buffer; } void Compiler::compileAndPrepareDevice() { if (!_impl->session) { logging::trace("Skipping 
Poplar compilation"); // This includes host side tensors, so has to be run even without a session. _impl->cachePopartTypes(); return; } const logging::LogContext ctx_func{"Compiler::compileAndPrepareDevice"}; // Poplar compilation. try { const logging::LogContext ctx{"popart::Session::prepareDevice: Poplar " "compilation"}; logging::trace("Begining Poplar compilation."); constexpr bool load_engine = false; // Don't automatically load the engine: we want to control when this happens // to make sure it happens at the same time in distributed environments. _impl->session->prepareDevice(load_engine); logging::trace("Finished Poplar compilation."); } catch (popart::memory_allocation_err &e) { logging::err("Out of memory, the graph profile is available here: {}", e.getProfilePath()); std::rethrow_exception(std::current_exception()); } _impl->cachePopartTypes(); } std::unique_ptr Compiler::getExecutionInfo() const { std::string as_string; switch (_impl->options.execution_mode) { case detail::ExecutionMode::Pipelined: { as_string = fmt::format(" mode(Pipelined), ipu({}), stage({})", _impl->active_ipu, _impl->active_stage); break; } case detail::ExecutionMode::Sharded: { as_string = fmt::format(" mode(Sharded), ipu({}), stage({})", _impl->active_ipu, _impl->active_stage); break; } case detail::ExecutionMode::Phased: { as_string = fmt::format(" mode(Phased), ipu({}), phase({})", _impl->active_ipu, _impl->active_phase); break; } default: ERROR("Invalid ExecutionMode active"); } // Copy into a memory managed array to get around ABI. return stringToUniquePtr(as_string); } std::unique_ptr Compiler::getPopartIR() const { const std::string as_string = _impl->getPopartIR(); // Copy into a memory managed array to get around ABI. return stringToUniquePtr(as_string); } std::set> Compiler::getTensorNames() const { std::set> casted_ids; const auto tensor_ids = _impl->getTensorNames(); for (const auto &tensor_id : tensor_ids) { // Copy into a memory managed array to get around ABI. 
casted_ids.insert(stringToUniquePtr(tensor_id)); } return casted_ids; } // Write the weights into IPU memory from the pytorch tensor buffers in the // model. void Compiler::copyWeightsToDevice(const std::vector &host_buffers) { if (!_impl->session) { logging::trace("Skipping writing weights from host to IPU memory."); return; } logging::info("Writing weights from host to IPU memory."); // Do we need to update the host buffers pointers before // uploading to the IPU? if (!host_buffers.empty()) { _impl->weights.updateData(host_buffers); _impl->session->writeWeights(_impl->weights); } _impl->session->weightsFromHost(); } void Compiler::registerUpdatableNamedBuffer(const TensorId &id) { auto popart_id = _impl->ids.at(id); ERROR_ON_MSG(!_impl->weights.contains(popart_id), "Invalid updatable buffer " << popart_id); const auto &buffers = _impl->popart_options.updatableNamedBuffers; if (std::find(buffers.begin(), buffers.end(), popart_id) != buffers.end()) { const auto &weight = _impl->weights.weight(popart_id); _impl->updatable_named_buffers.registerParameter(popart_id, weight.info); } } // Write the buffers into IPU memory from the pytorch tensor buffers in the // model. void Compiler::copyNamedBuffersToDevice( const std::vector &host_buffers) { if (!_impl->session) { logging::trace("Skipping writing buffers from host to IPU memory."); return; } logging::info("Writing named buffers from host to IPU memory."); if (!host_buffers.empty()) { _impl->updatable_named_buffers.updateData(host_buffers); _impl->session->writeWeights(_impl->updatable_named_buffers); } _impl->session->buffersFromHost(); } // Read the weights from IPU memory into the pytorch tensor buffers. void Compiler::copyWeightsToHost(const std::vector &host_buffers) { if (!_impl->session) { logging::trace("Skipping writing weights from IPU to host."); return; } logging::info("Writing weights from IPU to host."); // In PopTorch we use copyWeightsToHost and copyWeightsToDevice as // synchronisation routines. 
// It means we expect to have one buffer on the host, one on the device and // to synchronise the two in one direction or the other. // // PopART works differently: it has one set of read source buffers and one // set of write destination buffers and we need to keep those in sync // manually by calling writeWeights() // Transfer from the IPU to PopART read source buffers. _impl->session->weightsToHost(); // Update the Poptorch destination buffers _impl->weights.updateData(host_buffers); // Copy from the PopART read source buffers to the Poptorch buffers. _impl->session->readWeights(_impl->weights); // Keep the PopART write destination buffer in sync with the PopTorch buffer. _impl->session->writeWeights(_impl->weights); } void Compiler::updateOptimizers(const std::vector &optimizers) { ERROR_ON(!_impl->session); ERROR_ON(optimizers.empty()); ERROR_ON(!_impl->is_training); // Each of the groups of parameters are stored in a single PopART // optimizer that's why the vector of optimizers translates into // a single PopART optimizer. const std::unique_ptr optimizer = _impl->getPopartOptimizer(optimizers); // Update the popart graph/poplar executable with new optimizer. popart::TrainingSession &session = dynamic_cast(*_impl->session); session.updateOptimizerFromHost(optimizer.get()); } void Compiler::run() { if (!_impl->session) { // Nothing to run on IPU ERROR_ON(!_impl->popart_incoming.empty()); ERROR_ON(!_impl->popart_outgoing.empty()); ERROR_ON(!_impl->outgoing_duplicates.empty()); ERROR_ON(!_impl->memory_manager.empty()); return; } if (!isAttachedToDevice()) { attachToDevice(); } _impl->stepio.setInputGroupings(_impl->options.input_cgt, _impl->options.input_group_size, _impl->popart_options.replicatedGraphCount); // Execute the model on IPU. 
_impl->stepio.populate(_impl->popart_incoming, _impl->popart_outgoing); _impl->session->run(_impl->stepio); // In case several outputs point at the same tensor: duplicate the data for (const auto &out : _impl->outgoing_duplicates) { auto &src = _impl->popart_outgoing.at(out.first); for (auto *ptr : out.second) { std::memcpy(ptr, src.data(), src.nelms() * popart::getDataTypeInfoMap().at(src.dataType()).nbytes()); } } // The buffers handle the communication between pytorch and popart, we set // them up each run. _impl->popart_incoming.clear(); _impl->popart_outgoing.clear(); _impl->outgoing_duplicates.clear(); _impl->memory_manager.clear(); // Log the number of cycles if instrumentation is enabled const popart::SessionOptions &options = _impl->popart_options; if (options.instrumentWithHardwareCycleCounter) { _cycle_count = _impl->session->getCycleCount(); logging::debug("Total number of IPU cycles: {}", _cycle_count); } } PopartType Compiler::getPopartType(TensorId id) const { return _impl->getPopartType(id); } const char *Compiler::tensorName(TensorId id) const { return _impl->ids.at(id).c_str(); } bool Compiler::tensorIdIsValid(TensorId id) const { return id < _impl->ids.size(); } const std::vector Compiler::invalid_size{-1}; std::vector Compiler::getSize(TensorId id) const { if (isHostSideConstant(id)) { return _impl->getHostSideConstant(id).shape(); } if (_impl->session) { return _impl->session->getInfo(_impl->ids[id]).shape(); } const auto popart_id = _impl->ids.at(id); if (!_impl->active_builder->hasValueInfo(popart_id)) { return invalid_size; } return _impl->active_builder->getTensorShape(popart_id); } std::unique_ptr Compiler::getTensorDTypeString(TensorId id) const { std::string type_str; if (_impl->session) { type_str = _impl->session->getInfo(_impl->ids[id]).data_type(); } else { const auto popart_id = _impl->ids.at(id); if (_impl->active_builder->hasValueInfo(popart_id)) { type_str = _impl->active_builder->getTensorDtypeString(popart_id); } else { 
type_str = "unknown"; } } return stringToUniquePtr(type_str); } void Compiler::setCurrentPythonCodeLocation(const char *torch_node, const char *filename, std::uint64_t line, std::uint64_t col) { UNUSED(col); _impl->torch_node = torch_node; _impl->code_location = popart::SourceLocation("", filename, line); } void Compiler::clearActiveIpu() { _impl->active_ipu = -1; } void Compiler::setActiveIpu(std::uint64_t stage_id, std::int64_t phase_id, std::int64_t ipu_id) { switch (_impl->options.execution_mode) { case detail::ExecutionMode::Phased: ERROR_ON_MSG(phase_id < 0, "Invalid phase for ExecutionMode::Phased"); if (_impl->options.tensors_liveness == detail::Liveness::OffChipAfterEachPhase) { ERROR_ON_MSG(!_impl->options.serial_phases_execution, "This is only supported for serial phase execution"); _impl->active_phase = phase_id * 4; } else if (_impl->options.tensors_liveness == detail::Liveness::OffChipAfterFwdNoOverlap) { ERROR_ON_MSG(!_impl->options.serial_phases_execution, "This is only supported for serial phase execution"); _impl->active_phase = phase_id * 2; } else { _impl->active_phase = phase_id; } _impl->max_phase = std::max(_impl->active_phase, _impl->max_phase); if (!_impl->options.serial_phases_execution) { ERROR_ON_MSG(_impl->active_phase % 2 != ipu_id % 2, "When phases are executed in parallel: even phases must run " "on even IPUs and odd phases on odd IPUs"); } break; case detail::ExecutionMode::Pipelined: case detail::ExecutionMode::Sharded: _impl->active_stage = stage_id; break; default: ERROR("Unsupported ExecutionMode"); } // Record a number of times the IPU switches as this is needed to calculate // number of pipeline stages. if (static_cast(ipu_id) != _impl->last_ipu_used) { _impl->num_ipu_switches++; } _impl->active_ipu = ipu_id; // The previous will revert to -1 but this will remain ipu_id until another // IPU is used. 
// Returns the batch dimension PopART uses for the output anchored on `id`.
//
//   - host side constant: 1
//   - anchor returns All: popartBatchDim()
//   - anchor returns EveryN: popartBatchDim() divided by the return period
//   - any other return type: the replicated graph count
//
// Raises an internal error if a non-constant output has no anchor.
std::uint64_t Compiler::popartBatchDimForAnchor(TensorId id) const {
  // A constant cannot be batched.
  if (isHostSideConstant(id)) {
    return 1;
  }

  // Look up the anchor through the PopART id behind our wrapper id.
  const auto anchor_it = _impl->anchors.find(_impl->ids[id]);
  ERROR_ON_MSG(anchor_it == _impl->anchors.cend(),
               "Internal Error: Output op doesn't have an anchor.");
  const popart::AnchorReturnType &anchor = anchor_it->second;

  switch (anchor.id()) {
  case popart::AnchorReturnTypeId::All:
    // Returning everything means returning a full batch.
    return popartBatchDim();
  case popart::AnchorReturnTypeId::EveryN:
    // Only one in every `rp()` elements is copied back.
    return popartBatchDim() / anchor.rp();
  default:
    // Return an element for each replica.
    return _impl->popart_options.replicatedGraphCount;
  }
}
Optimiser parameters such as learning rate and loss // scaling are either stream or constant tensors, and so can be read/written // directly via memcpy if (state_tensor) { if (!_impl->optim_state_tensors.contains(t->id)) { _impl->optim_state_tensors.registerParameter(t->id, ti); } } else { tm.data = t->tensorData()->data(); tm.num_bytes = t->info.nbytes(); } metadata_list.push_back(std::move(tm)); }; for (auto *t : _impl->session->getIr().optimizerStateTensors()) { fn_add_tensor_data(t, true); } // Note: session->getIr().optimizerTensors() is empty for cached executables, // so get the optimizer tensors from the executable instead. for (auto *t : _impl->session->getExecutable().getOptimizerTensors()) { fn_add_tensor_data(t, false); } return metadata_list; } void Compiler::fillHostOptimizerStateTensorData( const std::vector &host_buffers) { logging::info("Writing optimiser state tensors from IPU to host."); // In PopTorch we use copyWeightsToHost and copyWeightsToDevice as // synchronisation routines. // It means we expect to have one buffer on the host, one on the device and // to synchronise the two in one direction or the other. // // PopART works differently: it has one set of read source buffers and one // set of write destination buffers and we need to keep those in sync // manually by calling writeWeights() // Transfer from the IPU to PopART read source buffers. _impl->session->weightsToHost(); // Update the Poptorch destination buffers _impl->optim_state_tensors.updateData(host_buffers); // Copy from the PopART read source buffers to the Poptorch buffers. _impl->session->readWeights(_impl->optim_state_tensors); // Keep the PopART write destination buffer in sync with the PopTorch buffer. 
_impl->session->writeWeights(_impl->optim_state_tensors); } void Compiler::writeDeviceOptimizerStateTensorData( const std::vector &host_buffers) { ERROR_ON_MSG(!_impl->session, "Session should be initialised first"); ERROR_ON_MSG(!isAttachedToDevice(), "Must be attached to a device to " "write the optimizer state."); logging::info("Writing optimiser state tensors from host to IPU memory."); _impl->optim_state_tensors.updateData(host_buffers); _impl->session->writeWeights(_impl->optim_state_tensors); _impl->session->weightsFromHost(); } Compiler::Compiler(Compiler &&compiler) : _cycle_count(compiler._cycle_count) { _impl = std::move(compiler._impl); } Compiler::Compiler(bool is_training, const SessionOptions &options) : _cycle_count(no_cycles) { _impl = std::make_unique(); _impl->is_training = is_training; _impl->popart_options = options._impl->popart_options; _impl->options = options._impl->poptorch_options; _impl->options_set = options._impl->options_set; } Compiler::~Compiler() = default; void Compiler::addOutputType(OutputTypeShape type) { _impl->output_types.emplace_back(type); } const std::vector &Compiler::outputTypes() const { return _impl->output_types; } void Compiler::startSubgraph() { popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder(); _impl->active_builder = subgraph; _impl->active_builder->addInputTensor( popart::TensorInfo{"INT64", popart::Shape{}}); const popart::TensorId keep_going = _impl->active_builder->addInputTensor( popart::TensorInfo{"BOOL", popart::Shape{}}); _impl->active_builder->addOutputTensor({keep_going}); } void Compiler::setAttribute(const char *attribute, const char *key, const char *value) { _impl->setAttribute(std::string(attribute), std::string(key), std::string(value)); } void Compiler::clearAttribute(const char *attribute, const char *key) { _impl->clearAttribute(std::string(attribute), std::string(key)); } TensorId Compiler::endForLoop(std::int32_t trip_count, std::int64_t num_outputs, const 
std::vector &inputs) { ERROR_ON_MSG(_impl->is_training, "poptorch.for_loop() is only supported in inference."); popart::Builder *body = _impl->active_builder; // Switch back to main graph. _impl->active_builder = _impl->active_builder->getParent(); auto ai_onnx = _impl->active_builder->aiOnnxOpset11(); const popart::ConstVoidData trip_count_data(&trip_count, {"INT32", popart::Shape{}}); const bool true_const = true; const popart::ConstVoidData the_data(&true_const, {"BOOL", popart::Shape{}}); const popart::TensorId trip_count_as_tensor = ai_onnx.constant(trip_count_data); const popart::TensorId condition = ai_onnx.constant(the_data); std::vector transformed_ins = {trip_count_as_tensor, condition}; for (const TensorId id : inputs) { transformed_ins.push_back(_impl->ids[id]); } std::vector output = ai_onnx.loop(transformed_ins, num_outputs, *body); return HandleOutput>{}(output, false, _impl.get()); } void Compiler::startIfBlock() { popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder(); _impl->active_builder = subgraph; _impl->if_true_stack.push(_impl->active_builder); } void Compiler::startElseBlock() { // Else must by definition be added after an if block. _impl->active_builder = _impl->active_builder->getParent(); popart::Builder *subgraph = &_impl->active_builder->createSubgraphBuilder(); _impl->active_builder = subgraph; _impl->if_false_stack.push(_impl->active_builder); } TensorId Compiler::endIfBlock(const TensorId &condition, std::size_t num_outputs) { ERROR_ON_MSG(_impl->is_training, "poptorch.cond() is only supported in inference."); // Pop back to the parent. _impl->active_builder = _impl->active_builder->getParent(); // Pop the false branch off the stack. popart::Builder *else_branch = _impl->if_false_stack.top(); _impl->if_false_stack.pop(); // Pop the true branch off the stack. 
popart::Builder *then_branch = _impl->if_true_stack.top(); _impl->if_true_stack.pop(); const popart::TensorId cond_as_popart = _impl->ids.at(condition); auto ai_onnx = _impl->active_builder->aiOnnxOpset11(); std::vector outputs = ai_onnx.logical_if( {cond_as_popart}, num_outputs, *else_branch, *then_branch); return HandleOutput>{}(outputs, false, _impl.get()); } void Compiler::pushNameScope(const char *name) { _impl->active_builder->pushNameScope(std::string(name)); } void Compiler::popNameScope() { _impl->active_builder->popNameScope(); } TensorId Compiler::addUntypedInputTensor() { const popart::TensorId out = _impl->active_builder->addUntypedInputTensor(); _impl->ids.push_back(out); return _impl->ids.size() - 1; } void Compiler::assertTensorIs(PopartType dataType, TensorId id) const { const PopartType actual_type = _impl->ids_types.at(id); if (__builtin_expect( static_cast(actual_type == PopartType::UNDEFINED), 0) != 0) { // Rare case of input tensor never used, so not in IR return; } ERROR_ON_MSG(actual_type != dataType, "One or more input data types have changed since the first model" " run. 
You will need to call \"destroy\" on the model before " "running with different input data types."); } void Compiler::addMultiConvPart(const std::vector &inputs, const std::vector &dilations, const std::vector &kernel_shape, const std::vector &pads, const std::vector &strides) { std::vector args; std::transform(inputs.cbegin(), inputs.cend(), std::back_inserter(args), [&](TensorId index) { return _impl->ids[index]; }); _impl->addMultiConvPart(args, dilations, kernel_shape, pads, strides); } void Compiler::setMultiConvAvailableMemoryProportions( const std::vector &v) { ERROR_ON_MSG( _impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: available_memory_proportions"); _impl->multi_conv_builder->setAvailableMemoryProportions( popart::vXtoY(v)); } void Compiler::setMultiConvPartialsTypes( const std::vector &partials_types) { ERROR_ON_MSG(_impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: partials_types"); _impl->multi_conv_builder->setPartialsTypes(partials_types); } void Compiler::setMultiConvEnableConvDithering( const std::vector &conv_ditherings) { ERROR_ON_MSG(_impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: enable_conv_dithering"); _impl->multi_conv_builder->setEnableConvDithering(conv_ditherings); } void Compiler::setMultiConvPlanType(int64_t plan_type) { ERROR_ON_MSG(_impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: plan_type"); _impl->multi_conv_builder->setPlanType(plan_type); } void Compiler::setMultiConvPerConvReservedTiles(int64_t v) { ERROR_ON_MSG(_impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: per_conv_reserved_tiles"); _impl->multi_conv_builder->setPerConvReservedTiles(static_cast(v)); } void Compiler::setMultiConvCycleBackOff(double c) { ERROR_ON_MSG(_impl->multi_conv_builder == nullptr, "Unexpected poptorch.MultiConv option: cycle_back_off"); _impl->multi_conv_builder->setCycleBackOff(static_cast(c)); } std::vector 
Compiler::endMultiConv() { auto outputs = _impl->endMultiConv(); const TensorId first = HandleOutput{}(outputs, false, _impl.get()); std::vector out_ids(outputs.size()); std::iota(out_ids.begin(), out_ids.end(), first); return out_ids; } TensorId Compiler::addCPUCallback(const std::vector &inputs, const CallbackMetadata &callback, std::vector input_types, std::vector> input_shapes, std::vector output_types, std::vector> output_shapes) { const logging::LogContext ctx{"Compiler::addCPUCallback"}; logging::trace("Starting CPU callback adding"); // Usual poptorch -> popart tensor conversion/lookup. std::vector ins; ins.reserve(inputs.size()); std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins), [&](TensorId index) { return _impl->ids[index]; }); // Populate the metadata structure which will be used to communicate between // all the components involved in running the host op. _impl->callbacks.emplace_front(); detail::CallbackInternalMetadata &metadata = _impl->callbacks.front(); // Python function we're calling. metadata.the_callback = callback.the_callback; // Pointers to the waiting python buffers. metadata.input_pointers = callback.input_pointers; metadata.output_pointers = callback.output_pointers; // A tracker so we can see how many streams have been inited by the poplar // buffer callback so we can call the python callback once it equals the // number of inputs. metadata.number_of_input_streams_inited = 0; // Used to mangle the name. detail::CallbackInternalMetadata::number_of_added_ops++; // Create an ID for each op so we can give a unique name to poplar for each // output/input. 
metadata.handle = "poptorch.host_op_" + std::to_string(detail::CallbackInternalMetadata::number_of_added_ops); metadata.input_types = std::move(input_types); metadata.input_shapes = std::move(input_shapes); metadata.output_types = std::move(output_types); metadata.output_shapes = std::move(output_shapes); std::map attributes_map; // We have to smuggle this through as a pointer as popart attribute map // doesn't support generic types. detail::CallbackInternalMetadata *as_ptr = &metadata; const std::intptr_t as_int = reinterpret_cast(as_ptr); const std::int64_t to_int64 = static_cast(as_int); logging::trace("Add CPU callback has added pointer {}", to_int64); attributes_map.insert({poptorch_custom_ops::host_op_metadata_attr, to_int64}); std::vector output = _impl->active_builder->customOp( poptorch_custom_ops::host_op, 1, ins, metadata.output_types.size(), attributes_map); // Convert the popart tensors back to poptorch tensors. return HandleOutput{}(output, false, _impl.get()); } std::uint32_t detail::CallbackInternalMetadata::number_of_added_ops = 0; void Compiler::detachFromDevice() { _impl->detachFromDevice(); } void Compiler::attachToDevice() { _impl->attachToDevice(); } bool Compiler::isAttachedToDevice() const { return _impl->isAttachedToDevice(); } Timestamps Compiler::getTimestamps() const { const auto num_inputs = getNumInputs(); const auto num_outputs = getNumOutputs(); Timestamps ts; ts.input.reserve(num_inputs); ts.input_complete.reserve(num_inputs); ts.output.reserve(num_outputs); ts.output_complete.reserve(num_outputs); for (size_t i = 0; i < num_inputs; i++) { const auto id = _impl->inputs[i]; ts.input.push_back(_impl->stepio.getInputTimestamps(id)); ts.input_complete.push_back(_impl->stepio.getInputCompleteTimestamps(id)); } for (size_t i = 0; i < num_outputs; i++) { const auto id = _impl->outputs[i]; ts.output.push_back(_impl->stepio.getOutputTimestamps(id)); ts.output_complete.push_back(_impl->stepio.getOutputCompleteTimestamps(id)); } return ts; } 
uint64_t Compiler::getCycleCount() const { if (_cycle_count != no_cycles) { return _cycle_count; } ERROR_ON_MSG(!_impl->popart_options.instrumentWithHardwareCycleCounter, "Cycle count logging is disabled."); ERROR("Please run the model at least once before obtaining cycle count."); } size_t Compiler::getNumInputs() const { return _impl->inputs.size(); } size_t Compiler::getNumOutputs() const { return _impl->outputs.size(); } void Compiler::verifySettingsForOverlappedIO(PopartOutputMode output_mode) { if (_impl->options.execution_mode == detail::ExecutionMode::Pipelined) { ERROR("Overlapped IO is not supported with poptorch.PipelinedExecution. " "If you are using only one IPU, please switch to " "poptorch.ShardedExecution."); } ERROR_ON_MSG(_impl->popart_options.numIOTiles == 0, "No IO tiles allocated. You must allocate at least 32 IO tiles " "using poptorch.Options().TensorLocations.numIOTiles."); if (output_mode != PopartOutputMode::Sum && output_mode != PopartOutputMode::All) { ERROR("Unsupported output mode for overlapped IO. 
Please switch output " "mode to poptorch.OutputMode.All or poptorch.OutputMode.Sum."); } } void setPopartLogLevel(logging::Level level) { for (uint64_t module = 0; module < static_cast(popart::logging::Module::none); module++) { popart::logging::setLogLevel(static_cast(module), static_cast(level)); } } void throwTestError(TestErrorType type) { const logging::LogContext ctx_top{"throwTestError::topLevel"}; { const logging::LogContext ctx{"throwTestError::bottomLevel"}; switch (type) { case TestErrorType::Poptorch: { ERROR("This is a PopTorch error"); } case TestErrorType::Popart: { throw popart::error("This is a Popart error"); } case TestErrorType::PopartInternal: { throw popart::internal_error("This is a Popart error"); } case TestErrorType::Poplibs: { throw poputil::poplibs_error("This is a Poplibs error"); } case TestErrorType::PoplarUnrecoverable: { throw poplar::unrecoverable_runtime_error("This is not recoverable"); } case TestErrorType::PoplarUnknown: { throw poplar::unknown_runtime_error("Don't know what happened"); } case TestErrorType::PoplarRecoverableFullReset: { throw poplar::recoverable_runtime_error( poplar::RecoveryAction::FULL_RESET, "Reboot needed"); } case TestErrorType::PoplarLinkError: { throw poplar::link_error("Link error", "Library -lfoo not found\ncheck path"); } default: { break; } } } ERROR("Unknown TestErrorType"); } namespace { class PopExceptionInfo : public ExceptionInfo { public: ~PopExceptionInfo() override = default; const char *what() const noexcept override; const char *type() const override; int64_t stackDepth() const override; const char *stack(int64_t level) const override; const char *filename() const override; uint64_t line() const override; const char *recoveryAction() const override; ErrorCategory category() const override; void extractStack(const popart::error &e); std::string mwhat; std::string mtype; std::vector mstack; std::string mfilename; uint64_t mline; std::string mrecovery_action; ErrorCategory mcategory; }; 
const char *PopExceptionInfo::what() const noexcept { return mwhat.c_str(); } const char *PopExceptionInfo::type() const { return mtype.c_str(); } int64_t PopExceptionInfo::stackDepth() const { return mstack.size(); } const char *PopExceptionInfo::stack(int64_t level) const { return mstack.at(level).c_str(); } const char *PopExceptionInfo::filename() const { return mfilename.c_str(); } uint64_t PopExceptionInfo::line() const { return mline; } const char *PopExceptionInfo::recoveryAction() const { return mrecovery_action.c_str(); } ErrorCategory PopExceptionInfo::category() const { return mcategory; } void PopExceptionInfo::extractStack(const popart::error &e) { std::istringstream iss(e.stackreport()); std::string l; // PopART adds a numbered prefix to each stack line: remove it: // [0] top_level_fn() // [1] main() // // Becomes: // // top_level_fn() // main() while (std::getline(iss, l)) { size_t first_space = l.find_first_of(' '); if (first_space == std::string::npos) { first_space = 0; } else { // Start at the first character after the space ++first_space; } mstack.push_back(l.substr(first_space)); } } } // namespace void rethrowPopartOrPoplarException(const std::exception_ptr &eptr, const char *filename, uint64_t line) { PopExceptionInfo pei; pei.mfilename = logging::shortPoptorchFilename(filename); pei.mline = line; pei.mcategory = ErrorCategory::Other; const std::string extra_info; try { std::rethrow_exception(eptr); } catch (const popart::internal_error &ex) { pei.mwhat = ex.what(); pei.mtype = "popart_internal_exception"; pei.extractStack(ex); } catch (const popart::error &ex) { pei.mwhat = ex.what(); pei.mtype = "popart_exception"; pei.extractStack(ex); } catch (const poplar::link_error &ex) { // Note: for some reason this error doesn't set its type in Poplar pei.mwhat = ex.what(); pei.mwhat += ". 
Output: " + ex.output; pei.mtype = "poplar_link_error"; } catch (const poplar::recoverable_runtime_error &ex) { pei.mwhat = ex.what(); pei.mtype = "poplar_"; pei.mtype += ex.type; pei.mcategory = ErrorCategory::RuntimeRecoverable; pei.mrecovery_action = poplar::toString(ex.getRecoveryAction()); } catch (const poplar::unrecoverable_runtime_error &ex) { pei.mwhat = ex.what(); pei.mtype = "poplar_"; pei.mtype += ex.type; pei.mcategory = ErrorCategory::RuntimeUnrecoverable; } catch (const poplar::poplar_error &ex) { pei.mwhat = ex.what(); pei.mtype = "poplar_"; pei.mtype += ex.type; } catch (const poputil::poplibs_error &ex) { pei.mwhat = ex.what(); pei.mtype = "poplibs_exception"; } catch (...) { return; } throw pei; } } // namespace popart_compiler } // namespace poptorch ================================================ FILE: popart_compiler/source/CompilerImpl.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "popart_compiler/Compiler.hpp" #include "popart_compiler/CompilerImpl.hpp" #include "popart_compiler/CompilerOptions.hpp" #include "popart_compiler/MultiConvBuilder.hpp" #include "popart_compiler/PopartEnums.hpp" #include "popart_compiler/Utils.hpp" namespace poptorch { namespace popart_compiler { namespace { std::string toString(const std::vector &vec) { std::stringstream ss; ss << "["; std::string sep{}; for (const auto &s : vec) { ss << sep << s; sep = ", "; } ss << "]"; return ss.str(); } std::string toString(OptimizerType type) { switch (type) { case OptimizerType::SGD1: return "SGD1"; case OptimizerType::SGD2: return "SGD2"; case OptimizerType::LAMB: case OptimizerType::LAMB_NO_BIAS: return "LAMB"; case OptimizerType::ADAM: return "ADAM"; case OptimizerType::ADAMW: 
case OptimizerType::ADAMW_NO_BIAS: return "ADAMW"; case OptimizerType::RMSPROP_CENTERED: case OptimizerType::RMSPROP: return "RMSPROP"; default: ERROR("Unreachable: Unsupported optimizer."); } } // If is_default: return the list of keys accepted by the // `const std::map> ¶ms` parameter // of the Popart constructor: it is usually the list of OptimizerValue // accepted by the explicit constructor. // // Else: return the list of keys accepted by insertSpecific (It's usually // defined in the optimizer's cpp file in a function called getSpecificNames() // TODO(T33686): these names should be provided by PopART. std::vector getAttributeNames(OptimizerType type, bool is_default) { switch (type) { case OptimizerType::SGD1: case OptimizerType::SGD2: { if (is_default) { return { "defaultLearningRate", "defaultWeightDecay", "defaultMomentum", "defaultDampening", "defaultVelocityScaling", "nesterov", "lossScaling"}; } return {"learningRate", "weightDecay", "momentum", "dampening", "velocityScaling", "nesterov"}; } case OptimizerType::LAMB: case OptimizerType::LAMB_NO_BIAS: { if (is_default) { return {"defaultLearningRate", "defaultWeightDecay", "defaultBeta1", "defaultBeta2", "defaultEps", "defaultMaxWeightNorm", "lossScaling"}; } return {"learningRate", "weightDecay", "beta1", "beta2", "eps", "maxWeightNorm"}; } case OptimizerType::ADAM: case OptimizerType::ADAMW: case OptimizerType::ADAMW_NO_BIAS: { if (is_default) { return {"defaultLearningRate", "defaultWeightDecay", "defaultBeta1", "defaultBeta2", "defaultEps", "lossScaling"}; } return {"learningRate", "weightDecay", "beta1", "beta2", "eps"}; } case OptimizerType::RMSPROP_CENTERED: case OptimizerType::RMSPROP: { if (is_default) { return {"defaultLearningRate", "defaultWeightDecay", "defaultAlpha", "defaultMomentum", "defaultEps", "lossScaling"}; } return {"learningRate", "weightDecay", "alpha", "momentum", "eps"}; } default: ERROR("Unreachable: Unsupported optimizer."); } } int indexOf(const std::vector &vec, const 
std::string &v) { auto it = std::find(vec.begin(), vec.end(), v); if (it == vec.end()) { return -1; } return it - vec.begin(); } std::vector vectorDiff(const std::vector &provided, const std::vector &expected) { std::vector missing; for (const auto &exp : expected) { if (indexOf(provided, exp) < 0) { missing.push_back(exp); } } return missing; } // Convert a Poptorch Optimizer into a map of parameters + types that // can be understood by the Popart Optimizer / insertSpecific. struct OptimizerParameters { public: OptimizerParameters(const Optimizer &opt, bool is_default); std::string debug() const; OptimizerType type; bool accum_types_provided; popart::DataType accum_type; popart::DataType first_order_momentum_accum_type; popart::DataType second_order_momentum_accum_type; bool use_tf_variant; float max_grad_norm; std::map> params; }; std::string OptimizerParameters::debug() const { std::stringstream ss; ss << toString(type); for (const auto &p : params) { ss << ", " << p.first << "=" << p.second.first; if (p.second.second) { ss << " (const)"; } } if (accum_types_provided) { ss << ", accumType=" << accum_type; ss << ", firstOrderMomentumAccumType=" << first_order_momentum_accum_type; ss << ", secondOrderMomentumAccumType=" << second_order_momentum_accum_type; } ss << ", useTfVariant=" << use_tf_variant; ss << ", maxGradNorm=" << max_grad_norm; return ss.str(); } OptimizerParameters::OptimizerParameters(const Optimizer &opt, bool is_default) : type(opt.type), accum_types_provided(opt.accum_types_provided), accum_type(opt.accum_type_is_half ? popart::DataType::FLOAT16 : popart::DataType::FLOAT), first_order_momentum_accum_type( opt.first_order_momentum_accum_type_is_half ? popart::DataType::FLOAT16 : popart::DataType::FLOAT), second_order_momentum_accum_type( opt.second_order_momentum_accum_type_is_half ? 
popart::DataType::FLOAT16 : popart::DataType::FLOAT), use_tf_variant(opt.use_tf_variant), max_grad_norm(opt.max_grad_norm) { // In Popart the attributes which can be specified per group are prefixed with // "default" For example learningRate -> defaultLearningRate In order to keep // it simple the PopTorch frontend will always use the group name, therefore // here we need to remap the PopTorch names to the Popart ones in the default // case we then fall back onto the default names for the remaining attributes // (e.g lossScaling) std::vector poptorch_names = getAttributeNames(opt.type, false); std::vector popart_names = getAttributeNames(opt.type, is_default); if (is_default) { poptorch_names.reserve(popart_names.size()); for (std::uint64_t i = poptorch_names.size(); i < popart_names.size(); ++i) { poptorch_names.push_back(popart_names[i]); } } std::vector provided_names; provided_names.reserve(poptorch_names.size()); for (const auto &p : opt.parameters) { const std::string name = reinterpret_cast(p.name); provided_names.push_back(name); const auto idx = indexOf(poptorch_names, name); ERROR_ON_MSG(idx < 0, "Unexpected " << (is_default ? "" : "group ") << "attribute " << name << " for optimizer " << toString(type) << ", allowed values: " << toString(poptorch_names)); ERROR_ON( !params.emplace(popart_names[idx], std::make_pair(p.value, p.is_const)) .second); } ERROR_ON_MSG(opt.parameters.size() != poptorch_names.size(), "Missing attributes: " << toString(type) << " optimizers require values for " << toString(vectorDiff(provided_names, poptorch_names))); } void assertSingleInstanceMaxNumIPUs(std::size_t num_ipus) { ERROR_ON_MSG(num_ipus > 64, "Too many IPUs requested (" << num_ipus << "). 
Experiments that need more than 64 " "IPUs require distributed execution."); } } // namespace namespace detail { popart::ConstVoidData StepIO::in(popart::TensorId id, int64_t num_elems, bool prefetch, bool /*isBroadcast*/) { (void)prefetch; timestamp(&_in_times, id); return get(id, &_inputs_info, num_elems, true); } void StepIO::inComplete(popart::TensorId id, int64_t num_elems, bool /*isBroadcast*/) { (void)num_elems; timestamp(&_in_complete_times, id); } popart::MutableVoidData StepIO::out(popart::TensorId id, int64_t num_elems) { timestamp(&_out_times, id); return get(id, &_outputs_info, num_elems, false); } void StepIO::outComplete(popart::TensorId id) { timestamp(&_out_complete_times, id); } void StepIO::computeStepDataInfo(const popart::TensorId &id, popart::IArray *array) { if (_step_data_info.find(id) != _step_data_info.end()) { return; } const auto dtype = AccessorType::getArrayDataType(*array); const auto rank = AccessorType::getArrayRank(*array); std::vector shape; for (size_t i = 0; i < rank; ++i) { shape.push_back(AccessorType::getArrayDim(*array, i)); } _step_data_info.insert({id, popart::TensorInfo(dtype, shape)}); } void StepIO::populate(const TensorArrayMap &inputs, const TensorArrayMap &outputs) { _inputs_info.clear(); for (const auto &input : inputs) { _inputs_info.insert({input.first, {input.second, 0, 0, 0}}); _in_times[input.first].clear(); _in_complete_times[input.first].clear(); computeStepDataInfo(input.first, &input.second); } _outputs_info.clear(); for (const auto &output : outputs) { _outputs_info.insert({output.first, {output.second, 0, 0, 0}}); _out_times[output.first].clear(); _out_complete_times[output.first].clear(); computeStepDataInfo(output.first, &output.second); } } template T StepIO::get(const popart::TensorId &id, TensorArrayInfo *map, int64_t num_elems, bool is_input) { auto it = map->find(id); ERROR_ON_MSG(it == map->end(), "Internal Compiler Error in StepIO"); auto &array_info = it->second; auto it2 = 
_step_data_info.find(id); ERROR_ON_MSG(it2 == _step_data_info.end(), "Internal Compiler Error in StepIO"); T step_data; step_data.info = it2->second; uint8_t *ptr = static_cast(AccessorType::getDataPointer(array_info.array)); const int64_t num_bytes = static_cast(step_data.info.getDataTypeInfo()->nbytes()) * num_elems; if (is_input && array_info.offset == array_info.end_offset) { int64_t tidx; const int64_t input_group_count = _replica_count / _input_group_size; if (_input_cgt == popart::CommGroupType::Consecutive) { tidx = array_info.replica_idx / _input_group_size; } else { ERROR_ON_MSG(_input_cgt != popart::CommGroupType::Orthogonal, "Unexpected input CommGroupType " << _input_cgt); tidx = array_info.replica_idx % input_group_count; } array_info.offset = tidx * (step_data.info.nbytes() / input_group_count); array_info.end_offset = ((tidx + 1) * (step_data.info.nbytes() / input_group_count)) % step_data.info.nbytes(); array_info.replica_idx = (array_info.replica_idx + 1) % _replica_count; } ptr += array_info.offset; array_info.offset = (array_info.offset + num_bytes) % step_data.info.nbytes(); step_data.data = ptr; return step_data; } void StepIO::timestamp(TensorTimestamps *time, const popart::TensorId &id) { auto now = std::chrono::system_clock::now().time_since_epoch(); auto stamp = static_cast( std::chrono::duration_cast(now).count()) / 1000; time->at(id).push_back(stamp); } void StepIO::setInputGroupings(popart::CommGroupType type, int64_t input_group_size, int64_t replica_count) { _input_cgt = type; _input_group_size = input_group_size; _replica_count = replica_count; } const std::vector &WeightsIO::parameterIds() const { return _weights_order; } bool WeightsIO::contains(popart::TensorId id) const { return _weights.find(id) != _weights.end(); } popart::MutableVoidData WeightsIO::weight(popart::TensorId id) const { return _weights.at(id); } void WeightsIO::registerParameter(const popart::TensorId &id, const popart::TensorInfo &info) { ERROR_ON(contains(id)); 
_weights[id].info = info; _weights_order.push_back(id); } void WeightsIO::updateData(const std::vector &host_buffers) { ERROR_ON(host_buffers.size() != _weights_order.size()); for (std::uint64_t i = 0; i < host_buffers.size(); ++i) { if (host_buffers[i] != nullptr) { _weights[_weights_order[i]].data = host_buffers[i]; } } } bool ConstVoidDataLessThan::operator()(const popart::ConstVoidData &lhs, const popart::ConstVoidData &rhs) const { // Optional data should not be set ERROR_ON(lhs.storesData()); ERROR_ON(rhs.storesData()); // First compare on data type if (lhs.info.dataType() != rhs.info.dataType()) { return lhs.info.dataType() < rhs.info.dataType(); } // Next compare on shape const auto &lhs_shape = lhs.info.shape(); const auto &rhs_shape = rhs.info.shape(); if (lhs_shape != rhs_shape) { // Shape is a vector so uses std::lexicographical_compare return lhs_shape < rhs_shape; } // Otherwise, compare underlying data ERROR_ON(lhs.info.nbytes() != rhs.info.nbytes()); return std::memcmp(lhs.data, rhs.data, lhs.info.nbytes()) < 0; } CompilerImpl::~CompilerImpl() { if (_device && isAttachedToDevice()) { detachFromDevice(); } } void CompilerImpl::setExecutionStrategyAttributes( const std::set &tensors) { ERROR_ON_MSG(active_ipu < 0, "No active Block, all the ops must belong to a Block"); switch (options.execution_mode) { case ExecutionMode::Pipelined: case ExecutionMode::Sharded: active_builder->pipelineStage(tensors, active_stage); break; case ExecutionMode::Phased: ERROR_ON(active_phase < 0); active_builder->executionPhase(tensors, active_phase); break; default: ERROR("Invalid ExecutionMode active"); } used_ipus.insert(active_ipu); active_builder->virtualGraph(tensors, active_ipu); } std::string CompilerImpl::checkSystemConfig() const { ERROR_ON_MSG(num_ipus == 0, "Must call createDevice() first"); auto &dm = popart::DeviceManager::createDeviceManager(); if (dm.enumerateDevices().empty()) { return "\nNo IPU detected in the system. 
\nFor more information use " "the Graphcore command-line tool `gc-monitor`."; } if (options_set.count("ipu_id") != 0u) { return ""; } if (dm.enumerateDevices(options.sync_pattern, num_ipus).empty()) { return fmt::format("\nNo device found on the system with {} IPUs: the " "configuration needs changing", num_ipus); } return ""; } void CompilerImpl::updateUseModelConfig() { // The configuration set by the application takes precedence over everything // else. if (options_set.count("use_model") != 0u) { logging::info("From the user configuration: Ipu model: {}", options.ipu_model ? "Enabled" : "Disabled"); } else if (ipuModelEnvironmentVariableIsEnabled() || ipuSmallModelEnvironmentVariableIsEnabled()) { // As a fallback the model can be enabled by the POPTORCH_IPU_MODEL // environment variable. options.ipu_model = true; } else { options.ipu_model = false; } } std::uint64_t CompilerImpl::numPipelineStages() { ERROR_ON(options.execution_mode != ExecutionMode::Pipelined); // Every time the IPU ID changes, there is an additional stage. In PopTorch, // two blocks/stages with the same IPU ID will be merged. std::uint64_t const forward_stages = num_ipu_switches + 1; // If training, there are twice the number of stages for backpropagation // minus one (because on the last IPU, the backpropagation happens as part of // the same pipeline stage). // (NB this is an upper bound, as tensor.detach() could cut off stages, but // we ignore unusual edge cases.) if (is_training) { return forward_stages * 2 - 1; } return forward_stages; } void CompilerImpl::addMemoryToOutput(TensorId id, void *ptr, std::unique_ptr &&memory) { if (isHostSideConstant(id)) { getHostSideConstant(id).copyDataTo(ptr); return; } memory_manager.push_back(std::move(memory)); popart::TensorId const popart_id = ids[id]; if (!popart_outgoing.insert({popart_id, *memory_manager.back().get()}) .second) { // Insertion in the map failed because there is already a pointer associated // with that id. 
outgoing_duplicates[popart_id].push_back(ptr); } } void CompilerImpl::addOutputTensor( const std::vector &tensors) { active_builder->addOutputTensor(tensors.at(0)); } popart::TensorId CompilerImpl::reshape(const std::vector &tensors, const std::vector &shape) { auto ai_onnx = active_builder->aiOnnxOpset11(); popart::Shape const s = {static_cast(shape.size())}; popart::TensorInfo const tensor_info("INT64", s); auto new_shape = ai_onnx.constant({shape.data(), tensor_info}); return ai_onnx.reshape({tensors.at(0), new_shape}, getDebugContext("Reshape")); } std::vector CompilerImpl::customOperation( const std::vector &args, const std::string &op, const std::string &domain, std::int64_t version, std::int64_t num_outputs, const std::shared_ptr> &attributes) { logging::info("Adding custom op with {} inputs ", static_cast(args.size())); // Convert to the the format required for Popart. We cannot use popart::any // as a known type externally in poptorch to avoid needing popart headers. std::map attributes_map; for (auto &attribute : *attributes) { attributes_map[attribute.name()] = *(attribute.getValue()); } if (!attributes->empty()) { std::stringstream ss; ss << "Attributes: "; for (auto &attribute : *attributes) { ss << attribute.name(); if (&attribute != &attributes->back()) { ss << ", "; } } logging::trace(ss.str().c_str()); } const std::int32_t num_inputs = static_cast(args.size()); popart::OperatorIdentifier const id = {domain, op, 1, num_inputs}; return active_builder->customOp(id, version, args, num_outputs, attributes_map, getDebugContext(op)); } popart::TensorId CompilerImpl::recomputationCheckpoint( const std::vector &tensors) { // Popart is simply a for loop over vector inputs and it is better for the // PyTorch Graph to avoid Tuple/List packs and unpacks ERROR_ON(tensors.size() != 1); return active_builder->checkpointOutput(tensors)[0]; } popart::TensorId CompilerImpl::tensorConstant(const std::vector &tensors, const PopartConstant &constant) { UNUSED(tensors); 
// Use the cache for the active builder. This is effectively one cache per // subgraph as constants only exist on one graph. auto ¤t_cache(_constants_cache[active_builder]); // Reuse a tensor if an identical one exists already if (current_cache.count(constant.getPopartData()) != 0u) { return current_cache[constant.getPopartData()]; } // To preserve memory, use a clone of the data const size_t buff_size = constant.getPopartData().info.nbytes(); _constant_cloned_data.emplace_back(new char[buff_size]); auto *new_buff = reinterpret_cast(&_constant_cloned_data.back()[0]); std::memcpy(new_buff, constant.getPopartData().data, buff_size); popart::ConstVoidData const new_constant(new_buff, constant.getPopartData().info); auto ai_onnx = active_builder->aiOnnxOpset11(); popart::TensorId tensor = ai_onnx.constant( new_constant, false /*is_value_sparse*/, getDebugContext("Constant")); current_cache[new_constant] = tensor; return tensor; } TensorId CompilerImpl::hostSideTensorConstant( const std::vector &tensors, HostSideConstant constant) { UNUSED(tensors); _host_side_constants.emplace(std::make_pair(ids.size(), std::move(constant))); // Add a dummy into ids ids.emplace_back("__poptorch__host_side_constant"); return ids.size() - 1; } std::shared_ptr CompilerImpl::createDevice(bool must_attach) { auto connection_type = options.connection_type; if (must_attach) { ERROR_ON_MSG( connection_type == popart::DeviceConnectionType::Never, "[Internal] must_attach incompatible with connection type Never"); connection_type = popart::DeviceConnectionType::Always; _device = nullptr; } else { ERROR_ON_MSG(_device, "device already created"); } updateUseModelConfig(); ERROR_ON(used_ipus.empty()); // Sometimes phased execution doesn't use all of the IPUs in a range, so check // the Ids too. 
const auto max_ipu_id = *std::max_element(used_ipus.begin(), used_ipus.end()); num_ipus = std::max(used_ipus.size(), max_ipu_id + 1) * popart_options.replicatedGraphCount; ERROR_ON_MSG(num_ipus == 0, "Your compiled model is empty (All the " "operations have been optimised out)"); assertSingleInstanceMaxNumIPUs(num_ipus); if (options.ipu_model) { if (popart_options.enableEngineCaching) { logging::warn("enableExecutableCaching doesn't work with the IPU model"); } errorOnCycleLogging(); std::map model_options; model_options["numIPUs"] = std::to_string(num_ipus); std::string const env_ipu_model_version = getIpuModelVersion(); model_options["ipuVersion"] = env_ipu_model_version; const int num_tiles_per_ipu = getNumTilesPerIpu(env_ipu_model_version); model_options["tilesPerIPU"] = std::to_string(num_tiles_per_ipu); ERROR_ON_MSG(connection_type == popart::DeviceConnectionType::Never, "ConnectionType.Never / poptorch.Options.useOfflineIpuTarget " "not supported for the IPU model"); _device = popart::DeviceManager::createDeviceManager().createIpuModelDevice( model_options); // Acquired HW devices will be attached if the used connection type is // Always but createIpuModelDevice() doesn't take a connection type // so we manually attach to the device if the connection type is needed. if (connection_type == popart::DeviceConnectionType::Always) { ERROR_ON_MSG(!_device->attach(), "Internal error: attach can't fail for " "model devices"); } logging::debug("Instantiated device, running on IPU model with {} tiles.", num_tiles_per_ipu); } else { if (connection_type == popart::DeviceConnectionType::Never) { // Offline compilation path: create an offline device regardless of what's // present on the system. 
ERROR_ON_MSG(options_set.count("ipu_id"), "Offline compilation targeting a specific id not supported"); errorOnCycleLogging(); const auto get_ipu_version = [&]() -> std::int64_t { if (options.ipu_version == CompilerOptions::use_system_ipu_version) { return ipuHardwareVersion(); } return options.ipu_version; }; std::map device_options; device_options["numIPUs"] = std::to_string(num_ipus); device_options["ipuVersion"] = "ipu" + std::to_string(get_ipu_version()); device_options["syncPattern"] = popart::syncPatternToString(options.sync_pattern); _device = popart::DeviceManager::createDeviceManager().createOfflineIPUDevice( device_options); ERROR_ON_MSG(!_device, "Failed to create offline IPU device"); } else { // Round up number of ipus to a power of 2. const auto rounded_num_ipus = roundUpNumIPUs(num_ipus); if (rounded_num_ipus != num_ipus) { std::string const common_msg( ", because PopTorch must reserve a power of 2 or" " maximum of 64 IPUs per process"); if (options.auto_round_num_ipus) { logging::warn("Reserving {} IPUs when the model specifices the use " "of only {}{}. {} will be reserved but not used.", rounded_num_ipus, num_ipus, common_msg, rounded_num_ipus - num_ipus); num_ipus = rounded_num_ipus; } else { ERROR("The model specifies the use of " << num_ipus << " IPUs, " "however PopTorch must reserve a minimum of " << rounded_num_ipus << " in order to allow the model to run" << common_msg << ". 
Please reconfigure your model to use a " "different number of IPUs or set " "poptorch.Options().autoRoundNumIPUs(True)."); } } assertSingleInstanceMaxNumIPUs(num_ipus); do { // Regular IPU hardware target if (options_set.count("ipu_id") == 0u) { _device = popart::DeviceManager::createDeviceManager() .tryAcquireAvailableDevice( num_ipus, 0, options.sync_pattern, connection_type); ERROR_ON_MSG(!_device && !waitIfUnavailable(), "Failed to acquire " << num_ipus << " IPU(s)" << this->checkSystemConfig()); if (_device) { logging::debug("Acquired {} IPU(s): running on device Id {}.", num_ipus, _device->getId()); } } else { _device = popart::DeviceManager::createDeviceManager().tryAcquireDeviceById( options.ipu_id, options.sync_pattern, connection_type); ERROR_ON_MSG(!_device && !waitIfUnavailable(), "Failed to acquire device Id " << options.ipu_id << checkSystemConfig()); ERROR_ON_MSG(_device && static_cast( _device->getNumIpus()) < num_ipus, "Expected at least replication factor * used IPUs = " << used_ipus.size() << " * " << popart_options.replicatedGraphCount << " = " << num_ipus << " device Ids but the user provided " << _device->getNumIpus()); if (_device && static_cast(_device->getNumIpus()) != num_ipus) { logging::warn( "Expected replication factor * used IPUs = {} * {} " "= {} device Ids but the device selected has {} IPUs which " "means some of them will not be used.", used_ipus.size(), popart_options.replicatedGraphCount, num_ipus, _device->getNumIpus()); } if (_device) { logging::debug("Acquired IPU device with id {}, running on device.", options.ipu_id); } } } while (!_device && waitForAWhile()); } } if (_device->isAttached()) { logging::trace("Attached to device {}", _device->getId()); } return _device; } void CompilerImpl::detachFromDevice() { if (used_ipus.empty()) { return; } logging::trace("Begin detaching device {}", _device->getId()); ERROR_ON_MSG(!_device, "Cannot find a valid device"); ERROR_ON_MSG(!_device->isAttached(), "The device has already been 
detached"); _device->detach(); logging::debug("Detached from device {}", _device->getId()); } bool CompilerImpl::isAttachedToDevice() const { if (used_ipus.empty()) { // We are always attached to at least 0 IPUs. return true; } ERROR_ON_MSG(!_device, "Cannot find a valid device"); return _device->isAttached(); } template void CompilerImpl::updateGroups(OptimizerType *optimizer, const std::vector &optimizers) { // For each optimizer group. for (std::size_t idx = 1; idx < optimizers.size(); ++idx) { // Index 0 is 'defaults' const std::size_t group = idx - 1; const OptimizerParameters group_opt{optimizers[idx], false}; logging::debug( "Updating group {} optimizer with {} for (tensors affected {})", group, group_opt.debug(), toString(grad_update_groups[group])); // For each tensor in the group. for (const popart::TensorId &id : grad_update_groups[group]) { // Update the optimizer optimizer->insertSpecific(id, group_opt.params); } } } std::unique_ptr CompilerImpl::getPopartOptimizer(std::vector optimizers) { if (optimizers.empty()) { return nullptr; } // If using the separate tensor variant, glue velocity scaling to loss // scaling. When T39344 is completed, there will be no benefit to setting // velocity scaling different to loss scaling for the separate tensor case. // The first optimizer contains the default values. auto &default_value_optimizer(optimizers[0]); if (default_value_optimizer.type == OptimizerType::SGD2) { copyParam(default_value_optimizer, default_value_optimizer, "lossScaling", "velocityScaling"); } // The first optimizer contains the default values. const OptimizerParameters opt{optimizers[0], true}; // Print to debug the new optimizer. 
logging::debug("Updating graph optimizer with {}", opt.debug()); ERROR_ON_MSG(std::isnan(opt.max_grad_norm), "Maximum norm of gradients cannot be NaN"); std::vector clipnorms; if (opt.max_grad_norm != std::numeric_limits::infinity()) { clipnorms.push_back( popart::ClipNormSettings::clipAllWeights(opt.max_grad_norm)); } switch (opt.type) { case OptimizerType::SGD1: { ERROR_ON(!opt.accum_types_provided); auto optimizer = std::unique_ptr(new popart::SGD( opt.params, clipnorms, popart::SGDAccumulatorAndMomentum::Combined, popart::DataType::UNDEFINED, popart::DataType::UNDEFINED, getDebugContext("SGD"))); updateGroups(optimizer.get(), optimizers); return optimizer; } case OptimizerType::SGD2: { ERROR_ON(!opt.accum_types_provided); // Copy loss scaling to velocity scaling for all groups for (std::size_t idx = 1; idx < optimizers.size(); ++idx) { copyParam(optimizers[idx], default_value_optimizer, "lossScaling", "velocityScaling"); } auto optimizer = std::unique_ptr(new popart::SGD( opt.params, clipnorms, popart::SGDAccumulatorAndMomentum::Separate, opt.accum_type, opt.first_order_momentum_accum_type, getDebugContext("SGD"))); updateGroups(optimizer.get(), optimizers); return optimizer; } case OptimizerType::ADAM: case OptimizerType::ADAMW: case OptimizerType::ADAMW_NO_BIAS: case OptimizerType::LAMB: case OptimizerType::LAMB_NO_BIAS: { auto adam_mode = popart::AdamMode::Adam; auto decay_mode = popart::WeightDecayMode::Decay; if (opt.type == OptimizerType::ADAM) { decay_mode = popart::WeightDecayMode::L2Regularization; } else if (opt.type == OptimizerType::ADAMW) { adam_mode = popart::AdamMode::Adam; } else if (opt.type == OptimizerType::LAMB) { adam_mode = popart::AdamMode::Lamb; } else if (opt.type == OptimizerType::LAMB_NO_BIAS) { adam_mode = popart::AdamMode::LambNoBias; } // NB WeightDecayMode set to default WeightDecayMode::Decay meaning true // weight decay rather than L2 ERROR_ON(!opt.accum_types_provided); auto optimizer = std::make_unique( opt.params, adam_mode, 
decay_mode, opt.accum_type, opt.first_order_momentum_accum_type, opt.second_order_momentum_accum_type, clipnorms, false, getDebugContext("Adam")); updateGroups(optimizer.get(), optimizers); return optimizer; } case OptimizerType::RMSPROP: case OptimizerType::RMSPROP_CENTERED: { ERROR_ON(!opt.accum_types_provided); popart::AdaptiveMode const mode = opt.type == OptimizerType::RMSPROP ? popart::AdaptiveMode::RMSProp : popart::AdaptiveMode::CenteredRMSProp; auto optimizer = std::make_unique( opt.params, mode, popart::WeightDecayMode::L2Regularization, opt.accum_type, opt.first_order_momentum_accum_type, opt.second_order_momentum_accum_type, popart::DataType::FLOAT, opt.use_tf_variant, getDebugContext("Adaptive")); updateGroups(optimizer.get(), optimizers); return optimizer; } default: ERROR("Unreachable: Unsupported optimizer."); } } popart::TensorId CompilerImpl::addNotInPlace(const std::vector &in) { auto ai_onnx = active_builder->aiOnnxOpset11(); popart::TensorId output = ai_onnx.add(in, getDebugContext("AddNotInPlace")); active_builder->setInplacePreferences( output, {{"AddLhsInplace", -1}, {"AddRhsInplace", -1}}); return output; } popart::TensorId CompilerImpl::randomNormal(const std::vector &tensors, const std::vector &shape, float mean, float scale, const std::string &dtype) { UNUSED(tensors); auto ai_onnx = active_builder->aiOnnxOpset11(); const auto pdt = popart::dataTypeFromString(dtype); return ai_onnx.randomnormal(shape, popart::getONNXDataTypeAsInt(pdt), mean, scale, nonstd::optional(), getDebugContext("Randomnormal")); } popart::TensorId CompilerImpl::randomUniform(const std::vector &tensors, const std::vector &shape, float high, float low, const std::string &dtype) { UNUSED(tensors); auto ai_onnx = active_builder->aiOnnxOpset11(); const auto pdt = popart::dataTypeFromString(dtype); return ai_onnx.randomuniform(shape, popart::getONNXDataTypeAsInt(pdt), high, low, nonstd::optional(), getDebugContext("Randomuniform")); } popart::TensorId 
CompilerImpl::ones(const std::vector<popart::TensorId> &tensors,
                   const std::vector<std::int64_t> &shape,
                   const std::string &dtype) {
  // Constant of the requested shape/dtype with every element set to one.
  return zerosOrOnes(tensors, shape, dtype, false);
}

// Constant of the requested shape/dtype with every element set to zero.
popart::TensorId
CompilerImpl::zeros(const std::vector<popart::TensorId> &tensors,
                    const std::vector<std::int64_t> &shape,
                    const std::string &dtype) {
  return zerosOrOnes(tensors, shape, dtype, true);
}

// Build a constant tensor of the given shape filled with 0 (zeros == true)
// or 1 (zeros == false).
//
// dtype must be one of "INT32", "FLOAT", "FLOAT16" or "BOOL"; any other
// string raises through ERROR. `tensors` is only forwarded to
// tensorConstant(). An empty `shape` yields a single element because the
// element count is seeded with 1.
popart::TensorId
CompilerImpl::zerosOrOnes(const std::vector<popart::TensorId> &tensors,
                          const std::vector<std::int64_t> &shape,
                          const std::string &dtype, bool zeros) {
  // std::accumulate accumulates in the type of its initial value, so the
  // seed must be a std::size_t: seeding with the int literal 1 would
  // truncate the element count to int at every step.
  const auto total_size = std::accumulate(shape.begin(), shape.end(),
                                          std::size_t{1},
                                          std::multiplies<std::size_t>());

  if (dtype == "INT32") {
    std::vector<std::int32_t> const_buff(total_size, zeros ? 0 : 1);
    const PopartConstant popart_const(PopartType::INT32, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "FLOAT") {
    std::vector<float> const_buff(total_size, zeros ? 0 : 1);
    const PopartConstant popart_const(PopartType::FLOAT, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "FLOAT16") {
    // Half values are stored as their 16-bit encoding.
    std::vector<std::uint16_t> const_buff(total_size,
                                          popart::floatToHalf(zeros ? 0 : 1));
    const PopartConstant popart_const(PopartType::FLOAT16, const_buff.data(),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  if (dtype == "BOOL") {
    // Wrap bool in a struct so the buffer is a plain contiguous array of
    // bools (std::vector<bool> does not expose one).
    struct Bool {
      bool b;
    };
    std::vector<Bool> const_buff(total_size, {!zeros});
    const PopartConstant popart_const(PopartType::BOOL, &(const_buff[0].b),
                                      shape);
    return tensorConstant(tensors, popart_const);
  }
  ERROR("Unsupported type " << dtype);
}

popart::TensorId
CompilerImpl::unfold(const std::vector<popart::TensorId> &tensors,
                     int64_t dimension, int64_t size, int64_t step) {
  // Implements the TUnfoldHelper interface in Poprithms using ONNX operations.
struct PoptorchUnfoldHelper { struct InternalState { CompilerImpl *parent; popart::Builder *builder; popart::TensorId tensor; popart::TensorId scalarConstI64(int64_t val) const { const PopartConstant val_const(PopartType::INT64, &val, {}); return parent->tensorConstant({}, val_const); } popart::TensorId shapeAsTensor(const std::vector &shape) const { std::vector new_shape(shape.begin(), shape.end()); const PopartConstant shape_const( PopartType::INT64, new_shape.data(), {static_cast(new_shape.size())}); return parent->tensorConstant({}, shape_const); } InternalState transform(popart::TensorId &&new_id) const { InternalState new_state(*this); new_state.tensor = std::move(new_id); parent->setExecutionStrategyAttributes({new_state.tensor}); return new_state; } }; static InternalState slice(const InternalState &state, uint64_t dim, uint64_t start, uint64_t end) { auto dims = state.scalarConstI64(dim); auto starts = state.scalarConstI64(start); auto ends = state.scalarConstI64(end); return state.transform(state.builder->aiOnnxOpset11().slice( {state.tensor, starts, ends, dims})); } static InternalState broadcast(const InternalState &state, uint64_t N, uint64_t dim) { auto new_shape = shape(state); ERROR_ON(new_shape[dim] != 1); new_shape[dim] *= N; auto shape_tensor = state.shapeAsTensor(new_shape); return state.transform( state.builder->aiOnnxOpset11().expand({state.tensor, shape_tensor})); } static InternalState reshape(const InternalState &state, const std::vector &shape) { auto shape_tensor = state.shapeAsTensor(shape); return state.transform( state.builder->aiOnnxOpset11().reshape({state.tensor, shape_tensor})); } static InternalState concat(const std::vector &states, uint64_t axis) { ERROR_ON(states.empty()); std::vector tensor_ids; tensor_ids.reserve(states.size()); for (const auto &tensor : states) { tensor_ids.push_back(tensor.tensor); } const auto &first = states.front(); return first.transform(first.builder->aiOnnxOpset11().concat( tensor_ids, 
static_cast(axis))); } static InternalState dimShuffle(const InternalState &state, const std::vector &permutation) { const std::vector permutation_ints(permutation.begin(), permutation.end()); state.builder->setAttribute("perm", permutation_ints); auto new_tensor = state.transform( state.builder->aiOnnxOpset11().transpose({state.tensor})); state.builder->clearAttribute("perm"); return new_tensor; } static uint64_t dim(const InternalState &state, uint64_t axis) { return state.builder->getTensorShape(state.tensor)[axis]; } static uint64_t rank_u64(const InternalState &state) { // NOLINT return static_cast( state.builder->getTensorShape(state.tensor).size()); } static std::vector shape(const InternalState &state) { const auto &&my_shape = state.builder->getTensorShape(state.tensor); return std::vector(my_shape.begin(), my_shape.end()); } }; ERROR_ON(dimension < 0); ERROR_ON(size < 0); ERROR_ON(step < 0); ERROR_ON(tensors.size() != 1); const auto &first = tensors.front(); using T = PoptorchUnfoldHelper::InternalState; using H = PoptorchUnfoldHelper; return poprithms::ndarray::Unfolder::unfold( {this, active_builder, first}, dimension, size, step) .tensor; } popart::TensorId CompilerImpl::prelu(std::vector &tensors) { const popart::TensorId &self = tensors[0]; popart::TensorId &weight = tensors[1]; const auto self_shape = active_builder->getTensorShape(self); const auto weight_shape = active_builder->getTensorShape(weight); if (self_shape.size() > weight_shape.size() + 1) { // PyTorch's implementation adds some extra singleton dimensions to the // weight to ensure it is 'unidirectionally broadcastable' with the input. 
std::vector unsqueeze_axes(self_shape.size() - weight_shape.size() - 1); std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), weight_shape.size()); weight = active_builder->aiOnnxOpset11().unsqueeze( {weight}, unsqueeze_axes, getDebugContext("Unsqueeze")); setExecutionStrategyAttributes({weight}); } return active_builder->aiOnnxOpset11().prelu(tensors, getDebugContext("Prelu")); } const HostSideConstant &CompilerImpl::getHostSideConstant(TensorId id) const { return _host_side_constants.at(id); } bool CompilerImpl::isHostSideConstant(TensorId id) const { return _host_side_constants.count(id) != 0u; } void CompilerImpl::addMultiConvPart( const std::vector &tensors, const std::vector &dilations, const std::vector &kernel_shape, const std::vector &pads, const std::vector &strides) { if (multi_conv_builder == nullptr) { multi_conv_builder = std::make_unique(); } multi_conv_builder->addConv(tensors, dilations, kernel_shape, pads, strides); } void CompilerImpl::setAttribute(const std::string &attribute, const std::string &key, const std::string &value) { auto &attrs = _attribute_key_value_map[attribute]; attrs[key] = value; std::vector attrs_vec; for (auto &attr : attrs) { attrs_vec.push_back(attr.first); attrs_vec.push_back(attr.second); } active_builder->setAttribute(attribute, attrs_vec); } void CompilerImpl::clearAttribute(const std::string &attribute, const std::string &key) { auto &attrs = _attribute_key_value_map[attribute]; ERROR_ON_MSG(attrs.erase(key) == 0, "Unknown key '" << key << "' for attribute '" << attribute << "'."); active_builder->clearAttribute(attribute); if (attrs.empty()) { ERROR_ON_MSG(_attribute_key_value_map.erase(attribute) == 0, "Unknown attribute '" << attribute << "'."); } else { std::vector attrs_vec; for (auto &attr : attrs) { attrs_vec.push_back(attr.first); attrs_vec.push_back(attr.second); } active_builder->setAttribute(attribute, attrs_vec); } } popart::DebugContext CompilerImpl::getDebugContext(const std::string &name) { 
std::string const op_name = op_builder->getNameScope() + name; popart::DebugContext const dc(name, code_location); popart::DebugInfo di(dc, "poptorch"); di.setValue("torch_schema", torch_node); di.setValue("op_type", name); di.setValue("op_name", op_name); return {di}; } std::vector CompilerImpl::endMultiConv() { ERROR_ON_MSG(multi_conv_builder == nullptr, "Unexpected end_multi_conv."); auto outs = multi_conv_builder->build(active_builder); multi_conv_builder.reset(); return outs; } bool CompilerImpl::waitIfUnavailable() const { // Force disable the wait if the system doesn't contain an IPU that // matches the requested config. static const bool should_wait = waitIfIpuIsUnavailable() && checkSystemConfig().empty(); return should_wait; } void CompilerImpl::attachToDevice() { if (used_ipus.empty()) { // We are always attached to at least 0 IPUs. return; } ERROR_ON_MSG(_device->isAttached(), "Already attached to a device"); // TODO(T21799): PopART onDemand connection will only try to connect to // the first device matching the requested config which means if several // tests only need 1 IPU, they will all wait on IPU 0. // As a workaround we request a new device from PopART and swap the device // in the live session. 
session->getDevice().setDeviceInfo(createDevice(/*must_attach*/ true)); ERROR_ON_MSG(!_device, "Cannot find a valid device"); ERROR_ON_MSG(!_device->isAttached(), "Still not attached to a device"); session->getDevice().loadEngineAndConnectStreams(); } std::string CompilerImpl::getPopartIR() const { if (used_ipus.empty()) { return "unavailable (No IPUs used)"; } if (session->getExecutable().isDeserialized()) { return "unavailable (Cached executable)"; } return session->serializeIr(popart::IrSerializationFormat::JSON); } std::set CompilerImpl::getTensorNames() const { return session->getAllTensorIds(); } PopartType CompilerImpl::getPopartType(TensorId id) const { if (isHostSideConstant(id)) { return getHostSideConstant(id).popartType(); } popart::DataType dtype; const auto popart_id = ids[id]; if (!session) { if (!active_builder->hasValueInfo(popart_id)) { return PopartType::UNDEFINED; } dtype = active_builder->getTensorDataType(popart_id); } else { if (!session->hasInfo(popart_id)) { return PopartType::UNDEFINED; } popart::TensorInfo const info = session->getInfo(popart_id); dtype = info.dataType(); } #define DEFINE_CASE(value) \ case popart::DataType::value: { \ return PopartType::value; \ } switch (dtype) { FOR_ALL_POPART_TYPES(DEFINE_CASE) } #undef DEFINE_CASE ERROR("Unsupported popart type in return: " << dtype); } void CompilerImpl::cachePopartTypes() { for (size_t idx = 1; idx < ids.size(); idx++) { ids_types.push_back(getPopartType(idx)); } } void CompilerImpl::errorOnCycleLogging() const { ERROR_ON_MSG(popart_options.instrumentWithHardwareCycleCounter, "Cycle count logging is only supported on actual IPU hardware."); } } // namespace detail } // namespace popart_compiler } // namespace poptorch ================================================ FILE: popart_compiler/source/SessionOptions.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
// NOTE(review): the `<...>` include targets and template argument lists appear
// to have been stripped from this copy of the file (bare `#include`,
// `template class Setter`, `std::function fn`, ...). The code tokens below are
// reproduced exactly as found.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/CompilerOptions.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/SessionOptionsImpl.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace {
// To avoid code duplication we use the same std::pair inserter
// to add values in map, vector, set containers but in practice only map
// actually takes a pair of values (The others take a single element). So, for
// containers taking only a single value, this magic string should be passed as
// the second element of the pair.
const std::string value_not_set = "__poptorch_value_not_set__";

// Wrapper functor used to print to the debug channel the value
// of the options set by poptorch.Options
//
// A Setter stores the callback that actually applies an option together with
// the option's name. Calling it forwards the value to the callback and then
// logs what was set.
template class Setter {
public:
  // fn:   callback applying the value to the options structure.
  // name: option name, only used in the debug message.
  Setter(std::function fn, std::string name)
      : _fn(std::move(fn)), _name(std::move(name)) {}
  void operator()(Value value);

private:
  std::function _fn;
  const std::string _name;
};

// Specialisation for the pair-of-strings inserter used by container options.
// The message depends on whether the second element is the `value_not_set`
// marker (set / vector insertion) or a real value (map insertion).
template <>
void Setter>::operator()(
    std::pair value) { // NOLINT
  _fn(value);
  if (value.second == value_not_set) {
    poptorch::logging::debug("poptorch.Options added {} to {}", value.first,
                             _name);
  } else {
    poptorch::logging::debug("poptorch.Options set {}[{}] to {}", _name,
                             value.first, value.second);
  }
}

// Generic case: apply the value, then log "<name> to value <value>".
template void Setter::operator()(Value value) {
  _fn(value);
  poptorch::logging::debug("poptorch.Options set {} to value {}", _name, value);
}

// Wrap `setter` in a logging Setter and store it in `options` under `name`.
// An entry already registered under the same name is replaced, because the
// assignment goes through operator[].
template
void registerSetter(std::map> &options,
                    const std::string &name, Lambda setter) {
  std::function fn = setter;
  options[name] = Setter(fn, name);
}
} // namespace

namespace poptorch {
namespace popart_compiler {
namespace detail {
// Populate the per-type option tables (bool_options, uint64_options,
// double_options, string_options, container_options) with one setter per
// option name. Each setter writes either into poptorch_options or into
// popart_options, and enum-like options are range checked first.
//
// NOTE(review): the target types of the `static_cast(...)` expressions and of
// `std::pair` appear to have been stripped from this copy of the file. The
// code tokens are reproduced exactly as found.
SessionOptionsImpl::SessionOptionsImpl() {
  // The keys must match the name and type of the attributes of SessionOptions
  // in python/__init__.py
  registerSetter(bool_options, "auto_round_num_ipus", [&](bool value) {
    poptorch_options.auto_round_num_ipus = value;
  });
  registerSetter(bool_options, "use_model",
                 [&](bool value) { poptorch_options.ipu_model = value; });
  registerSetter(bool_options, "serial_phases_execution", [&](bool value) {
    poptorch_options.serial_phases_execution = value;
  });
  registerSetter(bool_options, "separate_backward_phase", [&](bool value) {
    poptorch_options.separate_backward_phase = value;
  });
  registerSetter(bool_options, "broadcast_buffers", [&](bool value) {
    poptorch_options.broadcast_buffers = value;
  });
  registerSetter(bool_options, "enableExplicitIR", [&](bool enable) {
    popart_options.enableExplicitIR(enable);
  });
  registerSetter(uint64_options, "device_iterations",
                 [&](std::uint64_t value) { poptorch_options.steps = value; });
  registerSetter(uint64_options, "num_distributed_processes",
                 [&](std::uint64_t value) {
                   poptorch_options.num_distributed_processes = value;
                 });
  registerSetter(uint64_options, "distributed_process_id",
                 [&](std::uint64_t value) {
                   poptorch_options.distributed_process_id = value;
                 });
  registerSetter(uint64_options, "ipu_version", [&](std::uint64_t value) {
    poptorch_options.ipu_version = value;
  });
  registerSetter(uint64_options, "ipu_id",
                 [&](std::uint64_t value) { poptorch_options.ipu_id = value; });
  // Writes the same field as the "accumulationFactor" option registered
  // further down.
  registerSetter(
      uint64_options, "gradient_accumulation",
      [&](std::uint64_t value) { popart_options.accumulationFactor = value; });
  registerSetter(uint64_options, "output_return_period",
                 [&](std::uint64_t value) {
                   poptorch_options.output_return_period = value;
                 });
  registerSetter(uint64_options, "replication_factor",
                 [&](std::uint64_t value) {
                   popart_options.replicatedGraphCount = value;
                 });
  registerSetter(uint64_options, "input_group_size", [&](std::uint64_t value) {
    poptorch_options.input_group_size = static_cast(value);
  });
  registerSetter(uint64_options, "input_cgt", [&](std::uint64_t value) {
    poptorch_options.input_cgt = static_cast(value);
  });

  // Enum-valued options arrive as integers. Those with an `N` sentinel are
  // checked with `>= N`, the others with `> <last enumerator>`.
  registerSetter(uint64_options, "execution_mode", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast(ExecutionMode::N),
                 "Value for ExecutionMode out of range");
    poptorch_options.execution_mode = static_cast(value);
  });
  registerSetter(uint64_options, "tensors_liveness", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast(Liveness::N),
                 "Value for Liveness out of range");
    poptorch_options.tensors_liveness = static_cast(value);
  });
  registerSetter(uint64_options, "output_mode", [&](std::uint64_t value) {
    ERROR_ON_MSG(value >= static_cast(PopartOutputMode::N),
                 "Value for PopartOutputMode out of range");
    poptorch_options.output_mode = static_cast(value);
  });
  registerSetter(uint64_options, "connection_type", [&](std::uint64_t value) {
    ERROR_ON_MSG(
        value > static_cast(popart::DeviceConnectionType::Never),
        "Value for DeviceConnectionType out of range");
    poptorch_options.connection_type = static_cast(value);
  });
  registerSetter(
      uint64_options, "accumulateOuterFragmentSettings.schedule",
      [&](std::uint64_t value) {
        ERROR_ON_MSG(
            value > static_cast(
                        popart::AccumulateOuterFragmentSchedule::
                            OverlapMemoryOptimized),
            "Value for popart::AccumulateOuterFragmentSchedule out of range");
        popart_options.accumulateOuterFragmentSettings.schedule =
            static_cast(value);
      });
  // Not stored in the options: forwarded to the logging module.
  registerSetter(uint64_options, "max_repeat_logs",
                 [&](std::uint64_t value) { logging::setRepeatLimit(value); });
  // The element arrives as a string and is parsed with std::stoi, which
  // throws if the text is not an integer.
  registerSetter(container_options,
                 "accumulateOuterFragmentSettings.excludedVirtualGraphs",
                 [&](const std::pair &p) {
                   std::int64_t value = std::stoi(p.first);
                   popart_options.accumulateOuterFragmentSettings
                       .excludedVirtualGraphs.push_back(value);
                 });
  // The same field is also reachable through the
  // "accumulationAndReplicationReductionType" enum option registered further
  // down, which uses a `>= ReductionType::N` check instead.
  registerSetter(uint64_options, "accumulation_and_replication_reduction_type",
                 [&](std::uint64_t value) {
                   ERROR_ON_MSG(value > static_cast(
                                            popart::ReductionType::NoReduction),
                                "Value for popart::ReductionType out of range");
                   popart_options.accumulationAndReplicationReductionType =
                       static_cast(value);
                 });
  registerSetter(uint64_options, "sync_pattern", [&](std::uint64_t value) {
    ERROR_ON_MSG(value > static_cast(
                             popart::SyncPattern::ReplicaAndLadder),
                 "Value for SyncPattern out of range");
    poptorch_options.sync_pattern = static_cast(value);
  });
  registerSetter(uint64_options, "random_seed", [&](std::uint64_t value) {
    poptorch_options.random_seed = value;
  });
  registerSetter(string_options, "log_dir", [&](const std::string &value) {
    popart_options.logDir = value;
  });
  registerSetter(string_options, "saveInitializersToFile",
                 [&](const std::string &value) {
                   poptorch_options.external_initializers_file = value;
                 });

  // Handler meant to ignore (with a warning) attempts to set "logDir" through
  // the raw PopART options.
  // NOTE(review): ADD_POPART_STRING_OPTION(logDir) further down registers
  // "logDir" again and registerSetter() overwrites existing entries, so this
  // handler is replaced before the constructor returns and the warning is
  // never emitted. Confirm which of the two behaviours is intended.
  string_options["logDir"] = [&](const std::string &log_dir) {
    UNUSED(log_dir);
    logging::warn(
        "Ignoring call to poptorch.Options._Popart.set(\"logDir\",...): use "
        "poptorch.Options.logDir() instead");
  };

  registerSetter(string_options, "model_name", [&](const std::string &value) {
    poptorch_options.model_name = value;
  });

  // Container options: the pair's first element is the value (or map key).
  // The second element is the map value, or the "not set" marker for
  // set / vector insertions.
  registerSetter(container_options, "dotChecks",
                 [&](const std::pair &p) {
                   popart_options.dotChecks.insert(p.first);
                 });

  registerSetter(container_options, "hardwareInstrumentations",
                 [&](const std::pair &p) {
                   std::uint64_t value = std::stoul(p.first);
                   ERROR_ON_MSG(value >= static_cast(
                                             popart::Instrumentation::N),
                                "Value for Instrumentation out of range");
                   // clang-format off
                   popart_options.hardwareInstrumentations.insert(
                       static_cast(value));
                   // clang-format on
                 });

  registerSetter(container_options, "customCodelets",
                 [&](const std::pair &p) {
                   popart_options.customCodelets.push_back(p.first);
                 });

  registerSetter(container_options, "engineOptions",
                 [&](const std::pair &p) {
                   popart_options.engineOptions.emplace(p);
                 });

  registerSetter(container_options, "reportOptions",
                 [&](const std::pair &p) {
                   popart_options.reportOptions.emplace(p);
                 });

  registerSetter(container_options, "convolutionOptions",
                 [&](const std::pair &p) {
                   popart_options.convolutionOptions.emplace(p);
                 });

  registerSetter(container_options, "matmulOptions",
                 [&](const std::pair &p) {
                   popart_options.matmulOptions.emplace(p);
                 });

  registerSetter(container_options, "lstmOptions",
                 [&](const std::pair &p) {
                   popart_options.lstmOptions.emplace(p);
                 });

  registerSetter(container_options, "gclOptions",
                 [&](const std::pair &p) {
                   popart_options.gclOptions.emplace(p);
                 });

  registerSetter(container_options, "updatableNamedBuffers",
                 [&](const std::pair &p) {
                   popart_options.updatableNamedBuffers.push_back(p.first);
                 });

// Helper macros: each registers a setter whose key is the stringified member
// name and which assigns straight into popart_options.<name>. The enum
// variant first checks the integer against popart::<EnumType>::N.
#define ADD_POPART_ENUM_OPTION(name, EnumType)                                 \
  registerSetter(uint64_options, #name, [&](std::uint64_t value) {             \
    ERROR_ON_MSG(value >= static_cast(popart::EnumType::N),                    \
                 "Value for " << #EnumType << " out of range");                \
    popart_options.name = static_cast(value);                                  \
  })

#define ADD_POPART_BOOL_OPTION(name)                                           \
  registerSetter(bool_options, #name,                                          \
                 [&](bool value) { popart_options.name = value; })

#define ADD_POPART_UINT64_OPTION(name)                                         \
  registerSetter(uint64_options, #name,                                        \
                 [&](std::uint64_t value) { popart_options.name = value; })

#define ADD_POPART_DOUBLE_OPTION(name)                                         \
  registerSetter(double_options, #name,                                        \
                 [&](double value) { popart_options.name = value; })

#define ADD_POPART_STRING_OPTION(name)                                         \
  registerSetter(string_options, #name, [&](const std::string &value) {        \
    popart_options.name = value;                                               \
  })

  ADD_POPART_ENUM_OPTION(autodiffSettings.stitchStrategy,
                         AutodiffStitchStrategy);
  ADD_POPART_ENUM_OPTION(batchSerializationSettings.transformContext,
                         BatchSerializationTransformContext);
  ADD_POPART_ENUM_OPTION(batchSerializationSettings.method,
                         BatchSerializationMethod);
  ADD_POPART_ENUM_OPTION(batchSerializationSettings.batchSchedule,
                         BatchSerializationBatchSchedule);
  ADD_POPART_ENUM_OPTION(autoRecomputation, RecomputationType);
  ADD_POPART_ENUM_OPTION(mergeVarUpdate, MergeVarUpdateType);
  ADD_POPART_ENUM_OPTION(virtualGraphMode, VirtualGraphMode);
  ADD_POPART_ENUM_OPTION(syntheticDataMode, SyntheticDataMode);
  ADD_POPART_ENUM_OPTION(subgraphCopyingStrategy, SubgraphCopyingStrategy);
  ADD_POPART_ENUM_OPTION(accumulationAndReplicationReductionType,
                         ReductionType);
  ADD_POPART_ENUM_OPTION(meanAccumulationAndReplicationReductionStrategy,
                         MeanReductionStrategy);
  ADD_POPART_ENUM_OPTION(
      automaticLossScalingSettings.gradientTensorTrackingMethod,
      GradientTensorTrackingMethod);

  // NOTE(review): this replaces the warning handler installed for "logDir"
  // earlier in this constructor.
  ADD_POPART_STRING_OPTION(logDir);
  ADD_POPART_STRING_OPTION(cachePath);
  ADD_POPART_STRING_OPTION(partialsTypeMatMuls);
  ADD_POPART_STRING_OPTION(customCodeletCompileFlags);
  ADD_POPART_STRING_OPTION(serializedPoprithmsShiftGraphsDir);
  ADD_POPART_STRING_OPTION(kahnTieBreaker);

  ADD_POPART_UINT64_OPTION(executionPhaseSettings.phases);
  ADD_POPART_UINT64_OPTION(executionPhaseSettings.stages);
  ADD_POPART_UINT64_OPTION(batchSerializationSettings.factor);
  ADD_POPART_UINT64_OPTION(firstDotOp);
  ADD_POPART_UINT64_OPTION(finalDotOp);
  ADD_POPART_UINT64_OPTION(numIOTiles);
  ADD_POPART_UINT64_OPTION(mergeVarUpdateMemThreshold);
  ADD_POPART_UINT64_OPTION(looseThresholdAtPeak);
  ADD_POPART_UINT64_OPTION(accumulationFactor);
  ADD_POPART_UINT64_OPTION(swapLimitScheduler);
  ADD_POPART_UINT64_OPTION(globalReplicationFactor);
  ADD_POPART_UINT64_OPTION(globalReplicaOffset);
  ADD_POPART_UINT64_OPTION(defaultBufferingDepth);
  ADD_POPART_UINT64_OPTION(defaultPrefetchBufferingDepth);
  ADD_POPART_UINT64_OPTION(compilationProgressTotal);
  ADD_POPART_UINT64_OPTION(transitiveClosureOptimizationThreshold);
  ADD_POPART_UINT64_OPTION(automaticLossScalingSettings.updatePeriod);

  ADD_POPART_BOOL_OPTION(enableInplaceAmbiguityChecking);
  ADD_POPART_BOOL_OPTION(enableLoadAndOffloadRNGState);
  ADD_POPART_BOOL_OPTION(batchSerializationSettings.concatOnVirtualGraphChange);
  ADD_POPART_BOOL_OPTION(
      batchSerializationSettings.concatOnExecutionPhaseChange);
  ADD_POPART_BOOL_OPTION(
      batchSerializationSettings.concatOnPipelineStageChange);
  ADD_POPART_BOOL_OPTION(strictOpVersions);
  ADD_POPART_BOOL_OPTION(opxAliasChecking);
  ADD_POPART_BOOL_OPTION(opxModifyChecking);
  ADD_POPART_BOOL_OPTION(dotOpNames);
  ADD_POPART_BOOL_OPTION(exportPoplarComputationGraph);
  ADD_POPART_BOOL_OPTION(exportPoplarVertexGraph);
  ADD_POPART_BOOL_OPTION(separateCallOpPdfs);
  ADD_POPART_BOOL_OPTION(enableOutlining);
  ADD_POPART_BOOL_OPTION(enableOutliningCopyCostPruning);
  ADD_POPART_BOOL_OPTION(rearrangeAnchorsOnHost);
  ADD_POPART_BOOL_OPTION(rearrangeStreamsOnHost);
  ADD_POPART_BOOL_OPTION(enablePrefetchDatastreams);
  ADD_POPART_BOOL_OPTION(enableNonStableSoftmax);
  ADD_POPART_BOOL_OPTION(enableReplicatedGraphs);
  ADD_POPART_BOOL_OPTION(enableGradientAccumulation);
  ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter);
  ADD_POPART_BOOL_OPTION(enablePipelining);
  ADD_POPART_BOOL_OPTION(disableGradAccumulationTensorStreams);
  ADD_POPART_BOOL_OPTION(disableOptimizerStateTensorStreams);
  ADD_POPART_BOOL_OPTION(compileEngine);
  ADD_POPART_BOOL_OPTION(constantWeights);
  ADD_POPART_BOOL_OPTION(enableEngineCaching);
  ADD_POPART_BOOL_OPTION(enableMergeExchange);
  ADD_POPART_BOOL_OPTION(enableFloatingPointChecks);
  ADD_POPART_BOOL_OPTION(enableStochasticRounding);
  ADD_POPART_BOOL_OPTION(ensureFp32LossScaleTensor);
  ADD_POPART_BOOL_OPTION(explicitRecomputation);
  ADD_POPART_BOOL_OPTION(enableExplicitMainLoops);
  ADD_POPART_BOOL_OPTION(useHostCopyOps);
  ADD_POPART_BOOL_OPTION(aliasZeroCopy);
  ADD_POPART_BOOL_OPTION(delayVarUpdates);
  ADD_POPART_BOOL_OPTION(enableFullyConnectedPass);
  ADD_POPART_BOOL_OPTION(enableSerializedMatmuls);
  ADD_POPART_BOOL_OPTION(enableStableNorm);
  ADD_POPART_BOOL_OPTION(decomposeGradSum);
  ADD_POPART_BOOL_OPTION(enableDistributedReplicatedGraphs);
  ADD_POPART_BOOL_OPTION(groupHostSync);
  ADD_POPART_BOOL_OPTION(automaticLossScalingSettings.enabled);
  // Already registered above; this second registration is redundant.
  ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter);
  ADD_POPART_BOOL_OPTION(enableSupportedDataTypeCasting);
  ADD_POPART_BOOL_OPTION(groupNormStridedChannelGrouping);
  ADD_POPART_BOOL_OPTION(scheduleNonWeightUpdateGradientConsumersEarly);
  ADD_POPART_BOOL_OPTION(
      replicatedCollectivesSettings.prepareScheduleForMergingCollectives);
  ADD_POPART_BOOL_OPTION(
      replicatedCollectivesSettings.mergeAllReduceCollectives);

  ADD_POPART_DOUBLE_OPTION(outlineSequenceBreakCost);
  ADD_POPART_DOUBLE_OPTION(outlineThreshold);
  ADD_POPART_DOUBLE_OPTION(timeLimitScheduler);
  ADD_POPART_DOUBLE_OPTION(
      automaticLossScalingSettings.thresholdUpperCountProportion);
  ADD_POPART_DOUBLE_OPTION(automaticLossScalingSettings.binEdgeLocation);

#undef ADD_POPART_STRING_OPTION
#undef ADD_POPART_UINT64_OPTION
#undef ADD_POPART_BOOL_OPTION
#undef ADD_POPART_DOUBLE_OPTION
#undef ADD_POPART_ENUM_OPTION
}
} // namespace detail

// SessionOptions is a thin facade: every method forwards to the implementation
// object held in _impl.
SessionOptions::SessionOptions() : _impl(std::make_unique()) {}

// Move constructor: takes over the implementation object of `src`.
SessionOptions::SessionOptions(SessionOptions &&src)
    : _impl(std::move(src._impl)) {}

// The add*Option() methods look `option` up in the table for the matching
// value type. The last argument passed to set() is a human readable name of
// that type.
void SessionOptions::addStringOption(const char *option, const char *value) {
  _impl->set(option, value, _impl->string_options, "string");
}

void SessionOptions::addUint64Option(const char *option, std::uint64_t value) {
  _impl->set(option, value, _impl->uint64_options, "uint64");
}

void SessionOptions::addBoolOption(const char *option, bool value) {
  _impl->set(option, value, _impl->bool_options, "bool");
}

void SessionOptions::addDoubleOption(const char *option, double value) {
  _impl->set(option, value, _impl->double_options, "floating point");
}

// Insert a single value into a set / vector option: the pair's second element
// is the "value not set" marker.
void SessionOptions::insertStringOption(const char *option, const char *value) {
  _impl->set(option, std::pair(value, value_not_set),
             _impl->container_options, "set / vector");
}

// Insert a key / value pair into a map option.
void SessionOptions::insertStringPairOption(const char *option, const char *key,
                                            const char *value) {
  _impl->set(option, std::pair(key, value),
             _impl->container_options, "map");
}

bool SessionOptions::broadcastBuffers() const {
  return _impl->poptorch_options.broadcast_buffers;
}

// True when the input group size is smaller than the replication factor.
bool SessionOptions::hasInputReplication() const {
  return _impl->poptorch_options.input_group_size <
         _impl->popart_options.replicatedGraphCount;
}

void SessionOptions::setMemoryProportion(std::uint32_t ipu, float memory) {
  _impl->setMemoryProportion(ipu, memory);
}

// Return type of SessionOptions::setPatternsLevel, whose declarator follows.
void
// Definition of SessionOptions::setPatternsLevel; its `void` return type
// immediately precedes this declarator in the file.
//
// Replace the pattern set by the predefined PopART set for `level`. Raises an
// error when `level` is above PatternsLevel::All. Note that "patterns" is
// recorded in options_set before the range check runs.
//
// NOTE(review): `<...>` template arguments and include targets appear to have
// been stripped from this copy of the file. The code tokens are reproduced
// exactly as found.
SessionOptions::setPatternsLevel(std::uint64_t level) {
  _impl->options_set.insert("patterns");
  ERROR_ON(level > static_cast(popart::PatternsLevel::All));
  _impl->poptorch_options.patterns =
      popart::Patterns(static_cast(level));
}

// Enable or disable a single pattern by name on top of the current set.
void SessionOptions::addPattern(const char *pattern, bool enabled) {
  _impl->poptorch_options.patterns.enablePattern(pattern, enabled);
}

// Set one field of the tensor-location settings of one tensor category.
//   tensor: "location_activation", "location_weight", "location_optimizer" or
//           "location_accumulator".
//   option: "minElementsForOffChip", "minElementsForReplicatedTensorSharding",
//           "onChip", "useReplicatedTensorSharding", "useIOTilesToLoad" or
//           "useIOTilesToStore".
//   value:  stored as is for the two minElements* options; for the others any
//           value > 0 selects the first alternative (OnChip / On / IO).
// Raises an error for an unknown tensor or option name.
void SessionOptions::setTensorLocation(const char *tensor, const char *option,
                                       std::uint64_t value) {
  logging::debug("Setting {} to {} for location {}", option, value, tensor);
  std::string location_tensor{tensor};
  std::string opt{option};
  popart::TensorLocationSettings *settings;
  // Recorded before `location_tensor` is validated below.
  _impl->options_set.insert(location_tensor);
  if (location_tensor == "location_activation") {
    settings = &_impl->popart_options.activationTensorLocationSettings;
  } else if (location_tensor == "location_weight") {
    settings = &_impl->popart_options.weightTensorLocationSettings;
  } else if (location_tensor == "location_optimizer") {
    settings = &_impl->popart_options.optimizerStateTensorLocationSettings;
  } else if (location_tensor == "location_accumulator") {
    settings = &_impl->popart_options.accumulatorTensorLocationSettings;
  } else {
    // Presumably ERROR() does not return; otherwise `settings` would be used
    // uninitialised below - verify against the macro definition.
    ERROR("Unknown tensor location " << location_tensor);
  }

  if (opt == "minElementsForOffChip") {
    settings->minElementsForOffChip = value;
  } else if (opt == "minElementsForReplicatedTensorSharding") {
    settings->minElementsForReplicatedTensorSharding = value;
  } else if (opt == "onChip") {
    settings->location.storage = value > 0 ? popart::TensorStorage::OnChip
                                           : popart::TensorStorage::OffChip;
  } else if (opt == "useReplicatedTensorSharding") {
    settings->location.replicatedTensorSharding =
        value > 0 ? popart::ReplicatedTensorSharding::On
                  : popart::ReplicatedTensorSharding::Off;
  } else if (opt == "useIOTilesToLoad") {
    settings->location.loadTileSet =
        value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
  } else if (opt == "useIOTilesToStore") {
    settings->location.storageTileSet =
        value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
  } else {
    ERROR("Unknown option '" << opt << "' for tensor location "
                             << location_tensor);
  }
}

// Store the callback assigned to popart_options.compilationProgressLogger.
void SessionOptions::setCompilationProgressLogger(
    const std::function &logger) {
  _impl->popart_options.compilationProgressLogger = logger;
}

SessionOptions::~SessionOptions() = default;
} // namespace popart_compiler
} // namespace poptorch
================================================
FILE: popart_compiler/source/Utils.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include
#include
#include
#include
#include
#include "popart_compiler/CompilerImpl.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "popart_compiler/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

// These symbols exist in popart but are not declared publicly
namespace ONNX_NAMESPACE {
enum class TensorProto_DataType;
} // namespace ONNX_NAMESPACE

namespace popart {
namespace onnxutil {
DataType getDataType(int);
ONNX_NAMESPACE::TensorProto_DataType getTPDataType(DataType data_type);
} // namespace onnxutil
} // namespace popart

namespace poptorch {
namespace popart_compiler {

// Return true when POPTORCH_IPU_MODEL is set to a non-zero integer, false when
// it is unset or zero. The value is parsed with std::stoi, which throws if the
// text does not start with an integer.
bool ipuModelEnvironmentVariableIsEnabled() {
  if (const char *env_use_model = std::getenv("POPTORCH_IPU_MODEL")) {
    const bool model_enabled = std::stoi(env_use_model) != 0;
    logging::info("From POPTORCH_IPU_MODEL environment variable: Ipu model: {}",
                  model_enabled ? "Enabled" : "Disabled");
    return model_enabled;
  }
  return false;
}

// Same as above for POPTORCH_SMALL_IPU_MODEL, except that the result is always
// false when POPTORCH_IPU_MODEL is enabled.
bool ipuSmallModelEnvironmentVariableIsEnabled() {
  // POPTORCH_IPU_MODEL takes precedence over the small model.
  if (ipuModelEnvironmentVariableIsEnabled()) {
    return false;
  }
  if (const char *env_use_model = std::getenv("POPTORCH_SMALL_IPU_MODEL")) {
    const bool model_enabled = std::stoi(env_use_model) != 0;
    logging::info("From POPTORCH_SMALL_IPU_MODEL environment variable: small "
                  "Ipu model: {}",
                  model_enabled ? "Enabled" : "Disabled");
    return model_enabled;
  }
  return false;
}

// Return the content of POPTORCH_IPU_MODEL_VERSION, or "ipu2" when unset.
std::string getIpuModelVersion() {
  if (const char *env_ipu_model_version =
          std::getenv("POPTORCH_IPU_MODEL_VERSION")) {
    std::string str(env_ipu_model_version);
    return str;
  }
  return "ipu2"; // Default to MK2 if unspecified
}

// Return the number of tiles per IPU for the given model version.
// The small-model environment variable overrides the result with 4 tiles.
// Raises an error when the version is not one of ipu1 / ipu2 / ipu21 and does
// not contain "ipu:"; a version containing "ipu:" that matches nothing yields 0.
int getNumTilesPerIpu(const std::string &ipu_model_version) {
  int num_tiles_per_ipu = 0;

  if (ipu_model_version == "ipu1") {
    num_tiles_per_ipu = 1216; // MK1
  }
  if (ipu_model_version == "ipu2") {
    num_tiles_per_ipu = 1472; // MK2
  }
  if (ipu_model_version == "ipu21") {
    num_tiles_per_ipu = 1472; // C600
  }

  if (ipuSmallModelEnvironmentVariableIsEnabled()) {
    num_tiles_per_ipu = 4;
  }

  ERROR_ON_MSG((ipu_model_version.find("ipu:") == std::string::npos) &&
                   (num_tiles_per_ipu == 0),
               "Invalid IPU model version. Valid versions: ipu1, ipu2, ipu21.");
  return num_tiles_per_ipu;
}

// Round up the number of IPUs, if required, to the minimum number which needs
// to be reserved: the smallest power of two that is >= num_ipus (1 for 0 or 1).
std::uint64_t roundUpNumIPUs(std::uint64_t num_ipus) {
  std::uint64_t rounded_num_ipus = 1;

  // Keep doubling until the request is covered. There is no upper bound here.
  // NOTE(review): for num_ipus above 2^63 the doubling wraps to 0 and the loop
  // would not terminate - not reachable with realistic IPU counts.
  while (rounded_num_ipus < num_ipus) {
    rounded_num_ipus *= 2;
  }

  return rounded_num_ipus;
}

// Return true when POPTORCH_WAIT_FOR_IPU is set to a non-zero integer, i.e.
// the caller should wait rather than fail when no IPU is available.
bool waitIfIpuIsUnavailable() {
  bool wait = false;
  if (const char *env_wait_for_ipu = std::getenv("POPTORCH_WAIT_FOR_IPU")) {
    wait = std::stoi(env_wait_for_ipu) != 0;
    logging::info("From POPTORCH_WAIT_FOR_IPU environment variable: If no IPU "
                  "is available: {}",
                  wait ? "Wait" : "Fail & exit");
  }
  return wait;
}

// Block the calling thread for 15 seconds, then return true (always).
bool waitForAWhile() {
  constexpr std::int64_t sleep_time = 15;
  logging::trace("No IPU available, sleeping for {} seconds", sleep_time);
  std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
  return true;
}

// Return the hardware version encoded in the target architecture string of
// the first device found with `num_ipus` IPUs ("ipu2" -> 2, "ipu21" -> 21).
// Returns 0 when an IPU model is selected through the environment or when no
// device is found, and -1 (with a warning) when the string is not recognised.
std::int64_t ipuHardwareVersion(std::uint64_t num_ipus) {
  if (ipuModelEnvironmentVariableIsEnabled() ||
      ipuSmallModelEnvironmentVariableIsEnabled()) {
    return 0;
  }
  auto devices = popart::DeviceManager::createDeviceManager().enumerateDevices(
      popart::SyncPattern::Full, num_ipus);
  if (devices.empty()) {
    return 0;
  }
  const std::string arch = devices.front()->getTarget().getTargetArchString();
  // The architecture string must be 'ipu' followed by one or more non-zero
  // digits.
  bool is_valid = arch.size() > 3 && arch.find("ipu", 0) == 0;
  // Every character after the prefix must be in '1'..'9'; a '0' anywhere
  // (e.g. "ipu10") is treated as unknown.
  for (size_t i = 3; is_valid && i < arch.size(); ++i) {
    is_valid = arch[i] > '0' && arch[i] <= '9';
  }
  if (!is_valid) {
    logging::warn("Unknown IPU version: {} (Expected 'ipuX' "
                  " where X is one or more strictly positive digits)",
                  arch);
    return -1;
  }
  return std::atoi(arch.substr(3).c_str());
}

// Copy `str` into a freshly allocated, null-terminated char array owned by the
// returned unique_ptr.
std::unique_ptr stringToUniquePtr(const std::string &str) {
  auto ptr = std::unique_ptr(new char[str.size() + 1]);
  // std::string::copy() does not write a terminator, hence the explicit '\0'.
  str.copy(ptr.get(), std::string::npos);
  ptr.get()[str.size()] = '\0';
  return ptr;
}

// Convert a popart data type name into the integer value of the matching ONNX
// TensorProto data type.
int64_t dtypeIntFromOnnxStr(const char *onnx_type) {
  auto popart_type = popart::dataTypeFromString(onnx_type);
  return static_cast(popart::onnxutil::getTPDataType(popart_type));
}

// Inverse of dtypeIntFromOnnxStr: return the name popart associates with the
// ONNX data type integer `dtype`.
const char *onnxStrFromDtypeInt(int64_t dtype) {
  auto popart_type = popart::onnxutil::getDataType(dtype);
  const auto &data_type_map(popart::getDataTypeInfoMap());

  // data_type_map is static so the c_str() remains valid
  return data_type_map.at(popart_type).name().c_str();
}

// Convert a PopartType into the corresponding poplar::Type, going through the
// popart data type.
poplar::Type poplarTypeFromPoptorch(PopartType type) {
  const popart::DataType popart_type = popartTypeFromPoptorch(type);
  return popart::popx::popType(popart_type);
}

// Map each PopartType enumerator onto the popart::DataType of the same name.
// Raises an error for a value not listed in the switch.
popart::DataType popartTypeFromPoptorch(PopartType type) {
  switch (type) {
  case PopartType::UINT8:
    return popart::DataType::UINT8;
  case PopartType::INT8:
    return popart::DataType::INT8;
  case PopartType::UINT16:
    return popart::DataType::UINT16;
  case PopartType::INT16:
    return popart::DataType::INT16;
  case PopartType::INT32:
    return popart::DataType::INT32;
  case PopartType::INT64:
    return popart::DataType::INT64;
  case PopartType::UINT32:
    return popart::DataType::UINT32;
  case PopartType::UINT64:
    return popart::DataType::UINT64;
  case PopartType::BOOL:
    return popart::DataType::BOOL;
  case PopartType::FLOAT:
    return popart::DataType::FLOAT;
  case PopartType::FLOAT16:
    return popart::DataType::FLOAT16;
  case PopartType::BFLOAT16:
    return popart::DataType::BFLOAT16;
  case PopartType::FLOAT8_143:
    return popart::DataType::FLOAT8_143;
  case PopartType::FLOAT8_152:
    return popart::DataType::FLOAT8_152;
  case PopartType::DOUBLE:
    return popart::DataType::DOUBLE;
  case PopartType::COMPLEX64:
    return popart::DataType::COMPLEX64;
  case PopartType::COMPLEX128:
    return popart::DataType::COMPLEX128;
  case PopartType::STRING:
    return popart::DataType::STRING;
  case PopartType::UNDEFINED:
    return popart::DataType::UNDEFINED;
  default:
    ERROR("Unsupported type in popartTypeFromPoptorchType");
  }

  // Only reached if ERROR() returns; presumably it never does.
  return popart::DataType::UNDEFINED;
}

} // namespace popart_compiler
} // namespace poptorch
================================================
FILE: popart_compiler/source/custom_operations/Embedding.cpp
================================================
// Copyright (c) 2021, Graphcore Ltd, All rights reserved.
#include
#include
#include
#include
#include
#include "popart_compiler/CustomOps.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace poptorch {
namespace poptorch_custom_ops {

class EmbeddingGradOp;

// EmbeddingOp needs to be convertible to popart::GatherOp so that the tied
// gather pattern can match this implementation.
// Forward op of the custom embedding: a GatherOp fixed to axis 0 and group
// size 1 that additionally carries an optional padding index.
//
// NOTE(review): `<...>` template arguments appear to have been stripped from
// this copy of the file (nonstd::optional, std::unique_ptr, std::make_unique,
// getOp, ...). The code tokens are reproduced exactly as found.
class EmbeddingOp : public popart::GatherOp {
public:
  // padding_idx:                  optional index forwarded to the grad op.
  // available_memory_proportion_: optional value handed to GatherOp.
  EmbeddingOp(const popart::OperatorIdentifier &_opid,
              const nonstd::optional &padding_idx,
              const nonstd::optional &available_memory_proportion_,
              const popart::Op::Settings &settings_)
      : popart::GatherOp(_opid, /*axis=*/0, /*groupSize=*/1, settings_,
                         available_memory_proportion_),
        _padding_idx(padding_idx) {}

  std::unique_ptr clone() const final {
    return std::make_unique(*this);
  }

  // A single grad op constructed from this op (presumably an EmbeddingGradOp,
  // the only class here constructible from an EmbeddingOp).
  std::vector> getGradOps() final {
    std::vector> result;
    result.emplace_back(std::make_unique(*this));
    return result;
  }

  // Input 0 is the embedding table, input 1 the indices; single output 0.
  static popart::InIndex weightInIndex() { return 0; }
  static popart::InIndex indicesInIndex() { return 1; }
  static popart::OutIndex outIndex() { return 0; }

  // Adds padding_idx to the GatherOp attributes.
  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final {
    popart::GatherOp::appendOutlineAttributes(os);
    os.appendAttribute("padding_idx", paddingIndex());
  }

  nonstd::optional paddingIndex() const { return _padding_idx; }

private:
  nonstd::optional _padding_idx;
};

// Gradient op of EmbeddingOp. It consumes the gradient of the forward output
// and the forward indices, and produces the gradient of the weight input; its
// output info is the info of the forward weight.
class EmbeddingGradOp : public popart::Op {
public:
  // Copies the settings, padding index, available memory proportion and the
  // weight tensor info from the forward op.
  explicit EmbeddingGradOp(const EmbeddingOp &fwd_op)
      : popart::Op(poptorch_custom_ops::embedding_grad, fwd_op.getSettings()),
        _padding_idx(fwd_op.paddingIndex()),
        _available_memory_proportion(fwd_op.getAvailableMemoryProportion()),
        _wieght_info(fwd_op.inInfo(EmbeddingOp::weightInIndex())) {}

  std::unique_ptr clone() const final {
    return std::make_unique(*this);
  }

  // Grad input 0 <- gradient of the forward output, grad input 1 <- the
  // forward indices input.
  const std::vector &gradInputInfo() const final {
    static const std::vector info = {
        {gradInIndex(), EmbeddingOp::outIndex(), popart::GradOpInType::GradOut},
        {indicesInIndex(), EmbeddingOp::indicesInIndex(),
         popart::GradOpInType::In}};
    return info;
  }

  // Grad output 0 is the gradient of the forward weight input.
  const std::map &gradOutToNonGradIn() const final {
    static const std::map out = {
        {gradOutIndex(), EmbeddingOp::weightInIndex()}};
    return out;
  }

  void setup() final { outInfo(gradOutIndex()) = _wieght_info; }

  static popart::InIndex gradInIndex() { return 0; }
  static popart::InIndex indicesInIndex() { return 1; }
  static popart::OutIndex gradOutIndex() { return 0; }

  void appendOutlineAttributes(popart::OpSerialiserBase &os) const final {
    popart::Op::appendOutlineAttributes(os);
    os.appendAttribute("padding_idx", paddingIndex());
    os.appendAttribute(popart::sAvailMemAttribute, availableMemoryProportion());
  }

  float getSubgraphValue() const final { return getLowSubgraphValue(); }

  nonstd::optional paddingIndex() const { return _padding_idx; }
  nonstd::optional availableMemoryProportion() const {
    return _available_memory_proportion;
  }

private:
  nonstd::optional _padding_idx;
  nonstd::optional _available_memory_proportion;
  // Tensor info of the forward weight ("wieght" is a misspelling of "weight";
  // the name is private to this class).
  popart::TensorInfo _wieght_info;
};

namespace {
// Data types accepted for the weight input / produced on the output.
popart::OpDefinition::DataTypes weight_dtypes = {
    popart::DataType::UINT8,   popart::DataType::UINT16,
    popart::DataType::UINT32,  popart::DataType::UINT64,
    popart::DataType::INT8,    popart::DataType::INT16,
    popart::DataType::INT32,   popart::DataType::INT64,
    popart::DataType::FLOAT16, popart::DataType::FLOAT};

// Data types accepted for the indices input (integers only).
popart::OpDefinition::DataTypes indices_dtypes = {
    popart::DataType::UINT8,  popart::DataType::UINT16,
    popart::DataType::UINT32, popart::DataType::UINT64,
    popart::DataType::INT8,   popart::DataType::INT16,
    popart::DataType::INT32,  popart::DataType::INT64};

popart::OpDefinition
    embedding_def({popart::OpDefinition::Inputs({{"weight", weight_dtypes},
                                                 {"indices", indices_dtypes}}),
                   popart::OpDefinition::Outputs({{"output", weight_dtypes}}),
                   popart::OpDefinition::Attributes({
                       {"padding_idx", {"*"}},
                   })});

// Factory registered with PopART for poptorch_custom_ops::embedding: reads
// the optional "padding_idx" and available-memory attributes and builds an
// EmbeddingOp from them.
popart::OpCreator embedding_creator(
    popart::OpDefinitions({{poptorch_custom_ops::embedding, embedding_def}}),
    [](const popart::OpCreatorInfo &info) {
      nonstd::optional padding_idx;
      if (info.attributes.hasAttribute("padding_idx")) {
        padding_idx = info.attributes.getAttribute(
            "padding_idx");
      }

      nonstd::optional available_memory_proportion;
      if (info.attributes.hasAttribute(popart::sAvailMemAttribute)) {
        available_memory_proportion =
            info.attributes.getAttribute(
                popart::sAvailMemAttribute);
      }

      return std::unique_ptr(new EmbeddingOp(
          info.opid, padding_idx, available_memory_proportion, info.settings));
    },
    true);
} // namespace

// Poplar lowering of EmbeddingOp: a popops::multiSlice over dimension 0 of the
// weight, using a slice plan created once in the constructor.
class EmbeddingOpx : public popart::popx::Opx {
public:
  EmbeddingOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp(op, {poptorch_custom_ops::embedding});

    // We always want the EmbeddingOpx to layout its inputs
    inputCreatorPriority = std::numeric_limits::max();

    auto options = popart::popx::createSlicePlanOptions(
        popart::popx::SlicePlanUsedFor::Slice,
        getOp().getAvailableMemoryProportion());

    _plan = popart::popx::createSlicePlan(
        graph(), inInfo(EmbeddingOp::weightInIndex()),
        inInfo(EmbeddingOp::indicesInIndex()), options, /*axis=*/0);
  }

  // Emit the lookup: the indices are flattened into a column, one slice of
  // size 1 along dimension 0 is fetched per index, and the result is reshaped
  // to the op's output shape.
  void grow(poplar::program::Sequence &prog) const final {
    auto weight = getInTensor(EmbeddingOp::weightInIndex());
    auto indices = getInTensor(EmbeddingOp::indicesInIndex());

    // Assume non-negative indices.
    indices = indices.reinterpret(poplar::UNSIGNED_INT);
    indices = indices.flatten();
    indices = indices.expand({1});

    auto result = popops::multiSlice(graph(), weight, indices, {0}, {1}, prog,
                                     _plan, poplar::OptionFlags());

    result = result.reshape(outInfo(EmbeddingOp::outIndex()).shape_szt());
    setOutTensor(EmbeddingOp::outIndex(), result);
  }

  // Create the weight (sliceable tensor) or the indices tensor laid out
  // according to the slice plan. Throws popart::error for any other index.
  poplar::Tensor
  createInputTensor(popart::InIndex index,
                    const poplar::DebugNameAndId &dnai) const final {
    if (index != EmbeddingOp::weightInIndex() &&
        index != EmbeddingOp::indicesInIndex()) {
      throw popart::error(
          "EmbeddingOpx::createInputTensor : Invalid index = {}", index);
    }

    if (index == EmbeddingOp::weightInIndex()) {
      const auto &weight_info = inInfo(index);
      auto weight = popops::createSliceableTensor(
          graph(), popart::popx::popType(weight_info), weight_info.shape_szt(),
          {0}, {1}, _plan, poplar::OptionFlags(), dnai);
      return weight;
    }

    // Indices: create them with one entry per lookup, then give them the
    // element type and shape the op expects.
    const auto &indices_info = inInfo(index);
    auto num_lookups = static_cast(indices_info.nelms());
    auto indices = popops::createIndicesTensor(graph(), {0}, num_lookups, _plan,
                                               poplar::OptionFlags(), dnai);
    indices = indices.reinterpret(popart::popx::popType(indices_info));
    indices = indices.reshape(indices_info.shape_szt());
    return indices;
  }

  // Both inputs can be created by this opx; other indices defer to Opx.
  popart::popx::InputCreatorType
  getInputCreatorType(popart::InIndex index) const final {
    if (index == EmbeddingOp::weightInIndex() ||
        index == EmbeddingOp::indicesInIndex()) {
      return popart::popx::InputCreatorType::CanCreate;
    }

    return Opx::getInputCreatorType(index);
  }

  // No tensor has to exist before an input is created.
  std::set
  mustExistBeforeCreate(popart::InIndex index) const final {
    (void)index; // unused
    return {};
  }

private:
  popops::SlicePlan _plan;
};

// Poplar lowering of EmbeddingGradOp: accumulates the incoming gradient rows
// into a zero-initialised tensor shaped like the weight with
// popops::multiUpdateAdd, then zeroes the row at the padding index, if any.
class EmbeddingGradOpx : public popart::popx::Opx {
public:
  EmbeddingGradOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp(op, {poptorch_custom_ops::embedding_grad});

    // We always want the EmbeddingGradOpx to layout its inputs
    inputCreatorPriority = std::numeric_limits::max();

    auto grad_op = getOp();
    _padding_idx = grad_op.paddingIndex();

    auto options = popart::popx::createSlicePlanOptions(
        popart::popx::SlicePlanUsedFor::UpdateAdd,
        grad_op.availableMemoryProportion());

    _plan = popart::popx::createSlicePlan(
        graph(), outInfo(EmbeddingGradOp::gradOutIndex()),
        inInfo(EmbeddingGradOp::indicesInIndex()), options, /*axis=*/0);
  }

  void grow(poplar::program::Sequence &prog) const final {
    auto grad_in = getInTensor(EmbeddingGradOp::gradInIndex());
    auto indices = getInTensor(EmbeddingGradOp::indicesInIndex());
    auto output_shape = outInfo(EmbeddingGradOp::gradOutIndex()).shape_szt();

    // Output tensor, laid out for the update plan and cleared before use.
    auto out = popops::createSliceableTensor(
        graph(), grad_in.elementType(), output_shape, {0}, {1}, _plan,
        poplar::OptionFlags(), debugContext("embedding_grad_out"));

    popops::zero(graph(), out, prog, debugContext("zero"));

    // multiUpdateAdd takes a scale tensor; a constant 1 mapped to tile 0.
    auto scale = graph().addConstant(grad_in.elementType(), {}, 1.0f,
                                     debugContext("const_1"));
    graph().setTileMapping(scale, 0);

    // Reuse GatherGradOpx's helper to bring target / update / indices into
    // the form multiUpdateAdd is called with below.
    auto inputs = popart::popx::GatherGradOpx::handleNDMultiUpdate(
        out, grad_in, indices, 0, 1);
    auto &target_nd = std::get<0>(inputs);
    auto &update_nd = std::get<1>(inputs);
    auto &indices_nd = std::get<2>(inputs);

    popops::multiUpdateAdd(graph(), target_nd, update_nd, indices_nd, scale,
                           {0}, {1}, prog, _plan, poplar::OptionFlags(),
                           debugContext("embedding_grad"));

    // The row at the padding index receives no gradient.
    // NOTE(review): assumes *_padding_idx is a valid, non-negative row index
    // by the time it gets here - confirm against the caller.
    if (_padding_idx) {
      auto start = static_cast(*_padding_idx);
      auto padding = out.slice(start, start + 1, 0);
      popops::zero(graph(), padding, prog, debugContext("zero_padding_idx"));
    }

    setOutTensor(EmbeddingGradOp::gradOutIndex(), out);
  }

  // Create the incoming-gradient tensor (sliceable) or the indices tensor laid
  // out according to the slice plan. Throws popart::error for any other index.
  poplar::Tensor
  createInputTensor(popart::InIndex index,
                    const poplar::DebugNameAndId &dnai) const final {
    if (index != EmbeddingGradOp::gradInIndex() &&
        index != EmbeddingGradOp::indicesInIndex()) {
      throw popart::error(
          "EmbeddingGradOpx::createInputTensor : Invalid index = {}", index);
    }

    if (index == EmbeddingGradOp::gradInIndex()) {
      const auto &grad_info = inInfo(index);
      auto weight = popops::createSliceableTensor(
          graph(), popart::popx::popType(grad_info), grad_info.shape_szt(), {0},
          {1}, _plan, poplar::OptionFlags(), dnai);
      return weight;
    }

    const auto &indices_info = inInfo(index);
    auto num_lookups = static_cast(indices_info.nelms());
    auto indices = popops::createIndicesTensor(graph(), {0}, num_lookups, _plan,
                                               poplar::OptionFlags(), dnai);
    indices = indices.reinterpret(popart::popx::popType(indices_info));
    indices = indices.reshape(indices_info.shape_szt());
    return indices;
  }

  // Both inputs can be created by this opx; other indices defer to Opx.
  popart::popx::InputCreatorType
  getInputCreatorType(popart::InIndex index) const final {
    if (index == EmbeddingGradOp::gradInIndex() ||
        index == EmbeddingGradOp::indicesInIndex()) {
      return popart::popx::InputCreatorType::CanCreate;
    }

    return Opx::getInputCreatorType(index);
  }

  // No tensor has to exist before an input is created.
  std::set
  mustExistBeforeCreate(popart::InIndex index) const final {
    (void)index; // unused
    return {};
  }

private:
  nonstd::optional _padding_idx;
  popops::SlicePlan _plan;
};

namespace {
// Static registration objects for the two custom operator identifiers.
popart::popx::OpxCreator
    embedding_opx(poptorch_custom_ops::embedding);
popart::popx::OpxCreator
    embedding_grad_opx(poptorch_custom_ops::embedding_grad);
} // namespace

} // namespace poptorch_custom_ops
} // namespace
poptorch ================================================ FILE: popart_compiler/source/custom_operations/FastGatherLastDim.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include #include #include #include #include #include "FastGatherLastDim.hpp" #include "popart_compiler/CodeletsCompilation.hpp" #include "popart_compiler/CustomOps.hpp" #include "popart_compiler/Utils.hpp" namespace poptorch { namespace poptorch_custom_ops { FastGatherLastDimOp::FastGatherLastDimOp( const popart::OperatorIdentifier &opid_, const popart::Op::Settings &settings_, const std::string &debug_str) : popart::Op(opid_, settings_) { this->_axis = -1; this->_debug_str = debug_str; } std::vector> FastGatherLastDimOp::getGradOps() { std::vector> upops; upops.emplace_back(std::make_unique(*this)); return upops; } std::unique_ptr FastGatherLastDimOp::clone() const { return std::make_unique(*this); } void FastGatherLastDimOp::setup() { if (popart_compiler::ipuModelEnvironmentVariableIsEnabled() || popart_compiler::ipuSmallModelEnvironmentVariableIsEnabled()) { throw popart::error( "FastGatherLastDimOp requires hardware but IPU model is enabled"); } popart::Shape data_shape = this->inInfo(0).shape(); popart::Shape idx_shape = this->inInfo(1).shape(); popart::Shape out_shape = data_shape; // idx rank and data rank should be the same if (data_shape.size() != idx_shape.size()) { throw popart::error( "FastGatherLastDimOp::setup(), " "Input and Index tensors do not have same rank in Op {}", this->getDebugStr()); } // idx should have same dimensions as data except for last dim const int data_rank = static_cast(data_shape.size()); for (unsigned i = 0; i < data_shape.size() - 1; i++) { if (idx_shape[i] != data_shape[i]) { throw popart::error("FastGatherLastDimOp::setup(), " "Index tensor must have same dimensions as Input " "except for last dim. 
Op {}", this->getDebugStr()); } } int axis = this->_axis; if (axis < 0) { axis = data_rank + axis; } for (unsigned i = 0; i < data_shape.size(); i++) { if (static_cast(axis) != i) { out_shape[i] = data_shape[i]; } } out_shape[axis] = idx_shape[axis]; this->_in_shape = data_shape; this->_out_shape = out_shape; this->outInfo(0) = {this->inInfo(0).dataType(), out_shape}; } // register op static popart::OpDefinition::DataTypes fast_gather_op_data_tensor_type = { popart::DataType::FLOAT16, popart::DataType::FLOAT}; static popart::OpDefinition::DataTypes fast_gather_op_idx_tensor_type = { popart::DataType::INT32, popart::DataType::INT16}; static popart::OpDefinition fast_gather_op_def( {popart::OpDefinition::Inputs({ {"data", fast_gather_op_data_tensor_type}, {"index", fast_gather_op_idx_tensor_type}, }), popart::OpDefinition::Outputs({{"out", fast_gather_op_data_tensor_type}}), popart::OpDefinition::Attributes({})}); static popart::OpCreator fast_gather_op_creator( popart::OpDefinitions({{poptorch_custom_ops::fast_gather_last_dim, fast_gather_op_def}}), [](const popart::OpCreatorInfo &info) -> std::unique_ptr { popart::OperatorIdentifier const &opid = info.opid; popart::Op::Settings const &settings = info.settings; popart::Attributes const &attr = info.attributes; std::string const debug_str = attr.getAttribute("debug_str", "fast_gather_last_dim"); return std::unique_ptr( new FastGatherLastDimOp(opid, settings, debug_str)); }, true); FastGatherLastDimOpx::FastGatherLastDimOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, poptorch_custom_ops::fast_gather_last_dim); // Get around the ABI issues. 
auto managed_ptr = popart_compiler::compileCustomCodeletIfNeeded( "FastGatherLastDimFwdCodelets.inc.cpp", /*hw_only_codelet=*/true); const char *compiled_codelet_path = static_cast(managed_ptr.get()); graph().addCodelets(std::string(compiled_codelet_path)); } void FastGatherLastDimOpx::grow(poplar::program::Sequence &prog) const { auto data_tensor = getInTensor(0); auto idx_tensor = getInTensor(1); const FastGatherLastDimOp &fast_gather_last_dim_op = getOp(); popart::Shape fwd_op_out_shape = fast_gather_last_dim_op.getOutShape(); std::vector fwd_out_shape(fwd_op_out_shape.size()); for (unsigned i = 0; i < fwd_op_out_shape.size(); i++) { fwd_out_shape[i] = fwd_op_out_shape[i]; } poplar::Tensor const out_tensor = addGraphProg(graph(), prog, data_tensor, idx_tensor, fwd_out_shape); setOutTensor(0, out_tensor); } poplar::Tensor FastGatherLastDimOpx::addGraphProg( poplar::Graph &graph, poplar::program::Sequence &prog, const poplar::Tensor &data_tensor, const poplar::Tensor &idx_tensor, const std::vector &fwd_out_shape) { poplar::Tensor output_tensor = graph.addVariable(data_tensor.elementType(), fwd_out_shape, "sel_out"); auto target = graph.getTarget(); const unsigned num_tiles = target.getNumTiles(); const unsigned out_rank = idx_tensor.rank(); std::size_t alloc_cnt = 1; std::size_t channel_cnt = 1; for (unsigned i = 0; i < out_rank; i++) { if (i < out_rank - 1) { alloc_cnt = alloc_cnt * fwd_out_shape[i]; } if (i < out_rank - 2) { channel_cnt = channel_cnt * fwd_out_shape[i]; } } auto in_shape = data_tensor.shape(); auto out_shape = fwd_out_shape; poplar::ComputeSet const gather_cs = graph.addComputeSet("FastGatherCS"); std::vector tile_start(num_tiles, 0); std::vector tile_count(num_tiles, 0); poplar::Tensor const data_tensor_clone = graph.clone(data_tensor); poplar::Tensor const data_tensor_reshape = data_tensor_clone.reshape({alloc_cnt, in_shape[out_rank - 1]}); poplar::Tensor const idx_tensor_clone = graph.clone(idx_tensor); poplar::Tensor const idx_tensor_reshape 
= idx_tensor_clone.reshape({alloc_cnt, out_shape[out_rank - 1]}); poplar::Tensor const result_tensor_reshape = output_tensor.reshape({alloc_cnt, out_shape[out_rank - 1]}); std::size_t tile_idx_last = 1; for (std::size_t i = 0; i < alloc_cnt; ++i) { std::size_t const idx = (i * num_tiles) / alloc_cnt; graph.setTileMapping(data_tensor_reshape[i], idx); graph.setTileMapping(idx_tensor_reshape[i], idx); graph.setTileMapping(result_tensor_reshape[i], idx); if (tile_idx_last != idx) { tile_start[idx] = i; } tile_count[idx] += 1; tile_idx_last = idx; } prog.add(poplar::program::Copy(data_tensor, data_tensor_clone)); prog.add(poplar::program::Copy(idx_tensor, idx_tensor_clone)); for (unsigned i = 0; i < num_tiles; ++i) { if (0 == tile_count[i]) { continue; } poplar::VertexRef const gather_vertex = graph.addVertex( gather_cs, poputil::templateVertex("FastGatherVertex", data_tensor.elementType(), idx_tensor.elementType()), {{"data_", data_tensor_reshape.slice(tile_start[i], tile_start[i] + tile_count[i])}, {"idx_", idx_tensor_reshape.slice(tile_start[i], tile_start[i] + tile_count[i])}, {"result_", result_tensor_reshape.slice( tile_start[i], tile_start[i] + tile_count[i])}}); graph.setTileMapping(gather_vertex, i); graph.setInitialValue(gather_vertex["dst_shape_"], out_shape); } prog.add(poplar::program::Execute(gather_cs)); return output_tensor; } FastGatherLastDimGradOp::FastGatherLastDimGradOp( const FastGatherLastDimOp &fwdOp) : popart::Op(poptorch_custom_ops::fast_gather_last_dim_grad, fwdOp.getSettings()) { this->_axis = -1; this->_fwd_in_shape = fwdOp.getInShape(); this->_debug_str = fwdOp.getDebugStr(); } std::unique_ptr FastGatherLastDimGradOp::clone() const { return std::make_unique(*this); } FastGatherLastDimGradOpx::FastGatherLastDimGradOpx( popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp( op, poptorch_custom_ops::fast_gather_last_dim_grad); // Get around the ABI issues. 
auto managed_ptr = popart_compiler::compileCustomCodeletIfNeeded( "FastGatherLastDimBwdCodelets.inc.cpp", /*hw_only_codelet=*/true); const char *compiled_codelet_path = static_cast(managed_ptr.get()); graph().addCodelets(std::string(compiled_codelet_path)); } void FastGatherLastDimGradOpx::grow(poplar::program::Sequence &prog) const { poplar::Tensor const grad_output_tensor = getInTensor(0); poplar::Tensor const idx_tensor = getInTensor(1); const FastGatherLastDimGradOp &grad_op = getOp(); popart::Shape fwd_in_shape = grad_op.getFwdInShape(); std::vector fwd_in_shape_2(fwd_in_shape.size()); for (unsigned i = 0; i < fwd_in_shape.size(); i++) { fwd_in_shape_2[i] = static_cast(fwd_in_shape[i]); } auto zero = getScalarVariable(grad_output_tensor.elementType(), "zero"); graph().setInitialValue(zero, 0); auto output = zero; for (unsigned i = 0; i < fwd_in_shape.size(); ++i) { output = output.expand({0}); } for (unsigned i = 0; i < fwd_in_shape.size(); ++i) { output = output.broadcast(static_cast(fwd_in_shape[i]), i); } auto out_tensor = cloneNcopy(prog, output); poplar::Tensor const grad_input_tensor = addGraphProg(graph(), prog, grad_output_tensor, out_tensor, fwd_in_shape_2, idx_tensor); setOutTensor(0, out_tensor); } poplar::Tensor FastGatherLastDimGradOpx::addGraphProg( poplar::Graph &graph, poplar::program::Sequence &prog, const poplar::Tensor &grad_output_tensor, poplar::Tensor &grad_input_tensor, const std::vector &fwd_in_shape, const poplar::Tensor &idx_tensor) { auto target = graph.getTarget(); const unsigned num_tiles = target.getNumTiles(); const unsigned grad_output_rank = grad_output_tensor.rank(); std::size_t alloc_cnt = 1; std::size_t channel_cnt = 1; for (unsigned i = 0; i < grad_output_rank; i++) { if (i < grad_output_rank - 1) { alloc_cnt = alloc_cnt * grad_output_tensor.dim(i); } if (i < grad_output_rank - 2) { channel_cnt = channel_cnt * grad_output_tensor.dim(i); } } auto grad_output_shape = grad_output_tensor.shape(); auto grad_input_shape = 
fwd_in_shape; poplar::ComputeSet const gather_grad_cs = graph.addComputeSet("FastGatherGradCS"); std::vector tile_start(num_tiles, 0); std::vector tile_count(num_tiles, 0); poplar::Tensor const grad_output_tensor_clone = graph.clone(grad_output_tensor); poplar::Tensor const grad_output_tensor_reshape = grad_output_tensor_clone.reshape( {alloc_cnt, grad_output_shape[grad_output_rank - 1]}); poplar::Tensor const idx_tensor_clone = graph.clone(idx_tensor); poplar::Tensor const idx_tensor_reshape = idx_tensor_clone.reshape( {alloc_cnt, grad_output_shape[grad_output_rank - 1]}); poplar::Tensor const grad_input_tensor_reshape = grad_input_tensor.reshape( {alloc_cnt, grad_input_shape[grad_output_rank - 1]}); std::size_t tile_idx_last = 1; for (std::size_t i = 0; i < alloc_cnt; ++i) { std::size_t const idx = (i * num_tiles) / alloc_cnt; graph.setTileMapping(grad_output_tensor_reshape[i], idx); graph.setTileMapping(idx_tensor_reshape[i], idx); graph.setTileMapping(grad_input_tensor_reshape[i], idx); if (tile_idx_last != idx) { tile_start[idx] = i; } tile_count[idx] += 1; tile_idx_last = idx; } prog.add(poplar::program::Copy(idx_tensor, idx_tensor_clone)); prog.add(poplar::program::Copy(grad_output_tensor, grad_output_tensor_clone)); for (unsigned i = 0; i < num_tiles; ++i) { if (0 == tile_count[i]) { continue; } poplar::VertexRef const gather_vertex = graph.addVertex( gather_grad_cs, poputil::templateVertex("FastGatherGradVertex", grad_output_tensor.elementType(), idx_tensor.elementType()), {{"grad_out_", grad_output_tensor_reshape.slice( tile_start[i], tile_start[i] + tile_count[i])}, {"idx_", idx_tensor_reshape.slice(tile_start[i], tile_start[i] + tile_count[i])}, {"grad_in_", grad_input_tensor_reshape.slice( tile_start[i], tile_start[i] + tile_count[i])}}); graph.setTileMapping(gather_vertex, i); graph.setInitialValue(gather_vertex["grad_out_shape_"], grad_output_shape); graph.setInitialValue(gather_vertex["grad_in_shape_"], grad_input_shape); } 
prog.add(poplar::program::Execute(gather_grad_cs)); return grad_input_tensor; } namespace { popart::popx::OpxCreator fast_gather_last_dim_opx(poptorch_custom_ops::fast_gather_last_dim); popart::popx::OpxCreator fast_gather_last_dim_grad_opx( poptorch_custom_ops::fast_gather_last_dim_grad); } // namespace } // namespace poptorch_custom_ops } // namespace poptorch ================================================ FILE: popart_compiler/source/custom_operations/FastGatherLastDim.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #ifndef GUARD_POPTORCH_STATICGATHER_HPP #define GUARD_POPTORCH_STATICGATHER_HPP #include #include #include #include #include #include #include #include #include namespace poptorch { namespace poptorch_custom_ops { class FastGatherLastDimOp : public popart::Op { public: FastGatherLastDimOp(const popart::OperatorIdentifier &opid_, const popart::Op::Settings &settings_, const std::string &debug_str); FastGatherLastDimOp(const FastGatherLastDimOp &) = default; FastGatherLastDimOp &operator=(const FastGatherLastDimOp &) = delete; ~FastGatherLastDimOp() override = default; std::vector> getGradOps() final; std::unique_ptr clone() const final; void setup() final; float getSubgraphValue() const final { return getHighSubgraphValue(); } int64_t getAxis() const { return _axis; } popart::Shape getInShape() const { return _in_shape; } popart::Shape getOutShape() const { return _out_shape; } const std::string &getDebugStr() const { return _debug_str; } private: int64_t _axis; popart::Shape _in_shape; popart::Shape _out_shape; std::string _debug_str; }; class FastGatherLastDimOpx : public popart::popx::Opx { public: FastGatherLastDimOpx(popart::Op *, popart::popx::Devicex *); ~FastGatherLastDimOpx() override = default; void grow(poplar::program::Sequence &prog) const final; private: static poplar::Tensor addGraphProg(poplar::Graph &graph, poplar::program::Sequence &prog, const poplar::Tensor 
&data_tensor, const poplar::Tensor &idx_tensor, const std::vector &fwd_out_shape); }; class FastGatherLastDimGradOp : public popart::Op { public: explicit FastGatherLastDimGradOp(const FastGatherLastDimOp &fwdOp); std::unique_ptr clone() const final; virtual void setup() { this->outInfo(0) = {this->inInfo(0).dataType(), _fwd_in_shape}; } /* Describes the relationship of the inputs of the grad op to the inputs/outputs of the non-grad op */ virtual const std::vector &gradInputInfo() const { static const std::vector in_info = { // The input of grad op at index 0 is the gradient of the output at // index 0 of the non-grad op {0, 0, popart::GradOpInType::GradOut}, // The input of grad op at index 1 is the input at index 1 // of the non-grad op {1, 1, popart::GradOpInType::In}}; return in_info; } /* Describes the relationship of the outputs of the grad op to the inputs/outputs of the non-grad op */ virtual const std::map &gradOutToNonGradIn() const { static const std::map out_info = { // The output at index 0 is dLhs, i.e the gradient of the input at index // 0 of non-grad op {0, 0}, }; return out_info; } float getSubgraphValue() const final { return getLowSubgraphValue(); } const std::string &getDebugStr() const { return _debug_str; } popart::Shape getFwdInShape() const { return _fwd_in_shape; } private: int64_t _axis; popart::Shape _fwd_in_shape; std::string _debug_str; }; class FastGatherLastDimGradOpx : public popart::popx::Opx { public: FastGatherLastDimGradOpx(popart::Op *op, popart::popx::Devicex *devicex); ~FastGatherLastDimGradOpx() override = default; void grow(poplar::program::Sequence &prog) const final; private: static poplar::Tensor addGraphProg(poplar::Graph &graph, poplar::program::Sequence &prog, const poplar::Tensor &grad_output_tensor, poplar::Tensor &grad_input_tensor, const std::vector &fwd_in_shape, const poplar::Tensor &idx_tensor); }; } // namespace poptorch_custom_ops } // namespace poptorch #endif ================================================ 
FILE: popart_compiler/source/custom_operations/FastGatherLastDimBwdCodelets.inc.cpp ================================================ // Copyright (c) 2022, Graphcore Ltd, All rights reserved. #ifdef __IPU__ #include #else #error Not supported on IPU Model #endif #include #include using namespace poplar; static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR; template struct FloatDef{ }; template<> struct FloatDef{ typedef float2 FVType; typedef int2 IVType; static inline constexpr float2 kZeroV = { 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef float2 FVType; typedef short2 IVType; static inline constexpr float2 kZeroV = { 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef half4 FVType; typedef int2 IVType; static inline constexpr half4 kZeroV = { 0.0f, 0.0f, 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef half4 FVType; typedef short4 IVType; static inline constexpr half4 kZeroV = { 0.0f, 0.0f, 0.0f, 0.0f }; }; template struct OutputDef{ }; template<> struct OutputDef{ typedef Vector>> OutputType; }; template<> struct OutputDef{ typedef Vector>> OutputType; }; template class FastGatherGradVertex : public Vertex { public: FastGatherGradVertex() ; Vector>> grad_out_; Vector>> idx_; //Vector>> grad_in_; typename OutputDef::OutputType grad_in_; const Vector grad_out_shape_; const Vector grad_in_shape_; template::value, void>::type* = nullptr> static void run(Vector>> const& grad_out, Vector>> const& idx, typename OutputDef::OutputType& grad_in, Vector const& grad_out_shape, Vector const& grad_in_shape) { int c = grad_out.size(); int grad_out_dim_size = grad_out_shape[grad_out_shape.size() - 1]; int grad_out_dim_size_half = grad_out_dim_size >> 1; int grad_out_dim_size2 = grad_out_dim_size_half << 1; int grad_in_dim_size = grad_in_shape[grad_out_shape.size() - 1]; int grad_in_dim_size_half = grad_in_dim_size >> 1; int grad_in_dim_size2 = grad_in_dim_size_half << 1; for(int i = 0 ; i < c ; i ++) { typename FloatDef::FVType const* cur_grad_out_ptr2 = 
(typename FloatDef::FVType*)(&(grad_out[i][0])); typename FloatDef::IVType const* cur_idx_ptr2 = (typename FloatDef::IVType*)(&(idx[i][0])); typename FloatDef::FVType* cur_grad_in_ptr2 = (typename FloatDef::FVType*)(&(grad_in[i][0])); FT const* cur_grad_out_ptr = (FT*)cur_grad_out_ptr2; IT const* cur_idx_ptr = (IT const*)cur_idx_ptr2; FT* cur_grad_in_ptr = (FT*)cur_grad_in_ptr2; int j = 0; for(j = 0 ; j < grad_out_dim_size_half ; j ++) { typename FloatDef::FVType cur_grad_out = cur_grad_out_ptr2[j]; typename FloatDef::IVType idx = cur_idx_ptr2[j]; cur_grad_in_ptr[idx[0]] += cur_grad_out[0]; cur_grad_in_ptr[idx[1]] += cur_grad_out[1]; } if(0 != (grad_out_dim_size & 1)) { FT cur_grad_out = cur_grad_out_ptr[grad_out_dim_size2]; IT idx = cur_idx_ptr[grad_out_dim_size2]; cur_grad_in_ptr[idx] += cur_grad_out; } } }; template::value, void>::type* = nullptr> static void run(Vector>> const& grad_out, Vector>> const& idx, typename OutputDef::OutputType& grad_in, Vector const& grad_out_shape, Vector const& grad_in_shape) { int c = grad_out.size(); int grad_out_dim_size = grad_out_shape[grad_out_shape.size() - 1]; int grad_out_dim_size_q = grad_out_dim_size >> 2; int grad_out_dim_size4 = grad_out_dim_size_q << 2; int grad_in_dim_size = grad_in_shape[grad_out_shape.size() - 1]; int grad_in_dim_size_q = grad_in_dim_size >> 2; int grad_in_dim_size4 = grad_out_dim_size_q << 2; for(int i = 0 ; i < c ; i ++) { typename FloatDef::FVType const* cur_grad_out_ptr4 = (typename FloatDef::FVType*)(&(grad_out[i][0])); typename FloatDef::IVType const* cur_idx_ptr2 = (typename FloatDef::IVType const*)(&(idx[i][0])); typename FloatDef::FVType* cur_grad_in_ptr4 = (typename FloatDef::FVType*)(&(grad_in[i][0])); FT const* cur_grad_out_ptr = (FT*)cur_grad_out_ptr4; IT const* cur_idx_ptr = (IT const*)cur_idx_ptr2; FT* cur_grad_in_ptr = (FT*)cur_grad_in_ptr4; int j = 0; for(j = 0 ; j < grad_out_dim_size_q ; j ++) { typename FloatDef::FVType cur_grad_out = cur_grad_out_ptr4[j]; typename 
FloatDef::IVType idx0 = cur_idx_ptr2[2 * j]; typename FloatDef::IVType idx1 = cur_idx_ptr2[2 * j + 1]; cur_grad_in_ptr[idx0[0]] += cur_grad_out[0]; cur_grad_in_ptr[idx0[1]] += cur_grad_out[1]; cur_grad_in_ptr[idx1[0]] += cur_grad_out[2]; cur_grad_in_ptr[idx1[1]] += cur_grad_out[3]; } for(j = grad_out_dim_size4 ; j < grad_out_dim_size ; j ++) { FT cur_grad_out = cur_grad_out_ptr[j]; IT idx = cur_idx_ptr[j]; cur_grad_in_ptr[idx] += cur_grad_out; } } } bool compute() { run(grad_out_, idx_, grad_in_, grad_out_shape_, grad_in_shape_); return true; } }; template class FastGatherGradVertex; template class FastGatherGradVertex; ================================================ FILE: popart_compiler/source/custom_operations/FastGatherLastDimFwdCodelets.inc.cpp ================================================ // Copyright (c) 2022, Graphcore Ltd, All rights reserved. #ifdef __IPU__ #include #else #error Not supported on IPU Model #endif #include #include using namespace poplar; static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR; template struct FloatDef{ }; template<> struct FloatDef{ typedef float2 FVType; typedef int2 IVType; static inline constexpr float2 kZeroV = { 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef float2 FVType; typedef short2 IVType; static inline constexpr float2 kZeroV = { 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef half4 FVType; typedef int2 IVType; static inline constexpr half4 kZeroV = { 0.0f, 0.0f, 0.0f, 0.0f }; }; template<> struct FloatDef{ typedef half4 FVType; typedef short4 IVType; static inline constexpr half4 kZeroV = { 0.0f, 0.0f, 0.0f, 0.0f }; }; template class FastGatherVertex : public Vertex { public: FastGatherVertex() ; Vector>> data_; Vector>> idx_; Vector>> result_; const Vector dst_shape_; template::value, void>::type* = nullptr> static void run(Vector>> const& data, Vector>> const& idx, Vector>>& result, Vector const& dst_shape) { int c = data.size(); int out_dim_size = dst_shape[dst_shape.size() - 1]; int 
out_dim_size_v_r = out_dim_size >> 1; int out_dim_size_v = out_dim_size_v_r << 1; for(int i = 0 ; i < c ; i ++) { typename FloatDef::FVType const* cur_data_ptrv = (typename FloatDef::FVType const*)(&(data[i][0])); typename FloatDef::IVType const* cur_idx_ptrv = (typename FloatDef::IVType const*)(&(idx[i][0])); typename FloatDef::FVType* cur_out_ptrv = (typename FloatDef::FVType*)(&(result[i][0])); float const* cur_data_ptr = (float const*)cur_data_ptrv; int const* cur_idx_ptr = (int const*)cur_idx_ptrv; float* cur_out_ptr = (float*)cur_out_ptrv; int j = 0; for(j = 0 ; j < out_dim_size_v_r ; j ++) { typename FloatDef::IVType idx = cur_idx_ptrv[j]; typename FloatDef::FVType cur_val = { cur_data_ptr[idx[0]], cur_data_ptr[idx[1]] }; cur_out_ptrv[j] = cur_val; } if(0 != (out_dim_size & 1)) { int idx = cur_idx_ptr[out_dim_size_v]; cur_out_ptr[out_dim_size_v] = cur_data_ptr[idx]; } } }; template::value, void>::type* = nullptr> static void run(Vector>> const& data, Vector>> const& idx, Vector>>& result, Vector const& dst_shape) { int c = data.size(); int out_dim_size = dst_shape[dst_shape.size() - 1]; int out_dim_size_v_r = out_dim_size >> 2; int out_dim_size_v = out_dim_size_v_r << 2; for(int i = 0 ; i < c ; i ++) { typename FloatDef::FVType const* cur_data_ptrv = (typename FloatDef::FVType const*)(&(data[i][0])); typename FloatDef::IVType const* cur_idx_ptrv = (typename FloatDef::IVType const*)(&(idx[i][0])); typename FloatDef::FVType* cur_out_ptrv = (typename FloatDef::FVType*)(&(result[i][0])); FT const* cur_data_ptr = (FT const*)cur_data_ptrv; IT const* cur_idx_ptr = (IT const*)cur_idx_ptrv; FT* cur_out_ptr = (FT*)cur_out_ptrv; int j = 0; for(j = 0 ; j < out_dim_size_v_r ; j ++) { typename FloatDef::IVType idx0 = cur_idx_ptrv[2 * j]; typename FloatDef::IVType idx1 = cur_idx_ptrv[2 * j + 1]; typename FloatDef::FVType cur_val = { cur_data_ptr[idx0[0]], cur_data_ptr[idx0[1]], cur_data_ptr[idx1[0]], cur_data_ptr[idx1[1]] }; cur_out_ptrv[j] = cur_val; } for(j = 
out_dim_size_v ; j < out_dim_size ; j ++) { IT idx = cur_idx_ptr[j]; cur_out_ptr[j] = cur_data_ptr[idx]; } } }; bool compute() { run(data_, idx_, result_, dst_shape_); return true; } }; template class FastGatherVertex; template class FastGatherVertex; ================================================ FILE: popart_compiler/source/custom_operations/HostOp.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "popart_compiler/CompilerImpl.hpp" #include "popart_compiler/CustomOps.hpp" namespace poptorch { namespace poptorch_custom_ops { const char host_op_metadata_attr[] = "func_info"; } // namespace poptorch_custom_ops /* * A popart custom operation to handle Host operations. Takes a callback and * sets up the IPU->CPU communication for the needed tensors. */ namespace popart_compiler { namespace { // Get the popart type info for a given output from the stream metadata. popart::TensorInfo shapeInferOutput(detail::CallbackInternalMetadata *func_info, std::uint32_t i) { // Get type and shape from metadata. const popart::DataType type = popartTypeFromPoptorch(func_info->output_types[i]); const std::vector &shape = func_info->output_shapes[i]; // Convert from the poptorch/poplar type (std::size_t) to the popart one // (std::uint64_t). popart::Shape as_popart_shape; as_popart_shape.reserve(shape.size()); for (std::size_t elem : shape) { as_popart_shape.push_back(elem); } // Create popart info. return popart::TensorInfo{type, as_popart_shape}; } detail::CallbackInternalMetadata * getMetadataFromAttributeMap(const popart::Attributes &attrs) { // Pointer smuggled in via an integer. 
std::int64_t as_int = attrs.getAttribute( poptorch_custom_ops::host_op_metadata_attr); logging::trace("Pointer retrieved by CPU op {}", as_int); std::intptr_t as_ptr = static_cast(as_int); logging::trace("Casted from {} to {}", as_int, as_ptr); // Cast to the correct type. // NOLINTNEXTLINE performance-no-int-to-ptr return reinterpret_cast(as_ptr); } } // namespace /* Popart custom op which uses the metadata gathered by the compiler to setup poplar tensors and copy into/from them from/to host. */ class HostOp : public popart::Op { public: HostOp(const popart::OperatorIdentifier &_opid, detail::CallbackInternalMetadata *info, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), func_info(info) {} // Configure the output popart Tensor void setup() override { // Tell popart what the output should look like. for (std::uint32_t i = 0; i < func_info->output_types.size(); ++i) { outInfo(i) = shapeInferOutput(func_info, i); } } std::unique_ptr clone() const final { return std::make_unique(*this); } float getSubgraphValue() const final { return getLowSubgraphValue(); } detail::CallbackInternalMetadata *func_info; }; class HostOpx : public popart::popx::Opx { public: HostOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, poptorch_custom_ops::host_op); func_info = dynamic_cast(op)->func_info; } void grow(poplar::program::Sequence &sequence) const override { poplar::Graph &graph = this->graph(); // Get basic op info from metadata. const std::uint32_t num_inputs = func_info->input_types.size(); const std::uint32_t num_outputs = func_info->output_types.size(); // For each input create the FIFO and copy from it into the poplar tensor // popart has already created/ std::vector input_args; std::vector inputs; inputs.reserve(num_inputs); input_args.reserve(num_inputs); for (std::uint32_t input_index = 0; input_index < num_inputs; ++input_index) { // poplar::Tensor from popart. 
poplar::Tensor input_tensor = getInTensor(input_index); inputs.push_back(input_tensor); input_args.emplace_back(input_tensor.elementType(), input_tensor.numElements()); } std::vector output_args; std::vector outputs; outputs.reserve(num_outputs); output_args.reserve(num_outputs); for (std::uint32_t output = 0; output < num_outputs; ++output) { const poplar::Type type = poplarTypeFromPoptorch(func_info->output_types[output]); const std::vector &shape = func_info->output_shapes[output]; // Add the poplar tensor. std::string name = func_info->handle + "::out" + std::to_string(output); poplar::Tensor output_tensor = graph.addVariable( type, shape, poplar::VariableMappingMethod::LINEAR, std::move(name)); outputs.push_back(output_tensor); output_args.emplace_back(output_tensor.elementType(), output_tensor.numElements()); // Tell popart this is the output. setOutTensor(output, output_tensor); } poplar::HostFunction hf = graph.addHostFunction(func_info->handle, input_args, output_args); sequence.add(poplar::program::Call(hf, inputs, outputs)); } detail::CallbackInternalMetadata *func_info; }; } // namespace popart_compiler } // namespace poptorch static popart::OpCreator host_op_creator( {{poptorch::poptorch_custom_ops::host_op, {}}}, [](const popart::OpCreatorInfo &info) { // Get the stream info from the attribute map we passed to // create the op. auto *func_info = poptorch::popart_compiler::getMetadataFromAttributeMap( info.attributes); return std::unique_ptr(new poptorch::popart_compiler::HostOp( info.opid, func_info, info.settings)); }, true); static popart::popx::OpxCreator host_opx_creator(poptorch::poptorch_custom_ops::host_op); static popart::RegisterShapeInferenceFunction host_op_shape_inference( poptorch::poptorch_custom_ops::host_op, [](popart::ShapeInferenceContext &ctx) { // Get the stream info from the attribute map we passed to create the op. 
auto *func_info = poptorch::popart_compiler::getMetadataFromAttributeMap( ctx.getAttributes()); // Tell popart what the output should look like. for (std::uint32_t i = 0; i < func_info->output_types.size(); ++i) { ctx.outInfo(i) = poptorch::popart_compiler::shapeInferOutput(func_info, i); } }); ================================================ FILE: popart_compiler/source/custom_operations/TorchSoftplus.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include "TorchSoftplus.hpp" #include "popart_compiler/CustomOps.hpp" #include #include #include #include #include #include #include #include namespace poptorch { namespace poptorch_custom_ops { TorchSoftplusOp::TorchSoftplusOp(const popart::OperatorIdentifier &opid_, float beta, float threshold, const popart::Op::Settings &opSettings) : popart::ElementWiseUnaryOp(opid_, opSettings), _beta(beta), _threshold(threshold) {} std::unique_ptr TorchSoftplusOp::clone() const { return std::make_unique(*this); } std::vector> TorchSoftplusOp::getGradOps() { std::vector> result; result.emplace_back(std::make_unique(*this)); return result; } std::vector> TorchSoftplusOp::inplacePriorityDefault() const { // see T6768: choosing default inplace priorities return {{poptorch_custom_ops::torch_softplus_inplace, 10}}; } std::unique_ptr TorchSoftplusOp::getInplaceVariant( const popart::OperatorIdentifier &operator_id) const { if (operator_id == poptorch_custom_ops::torch_softplus_inplace) { return std::make_unique(*this); } return popart::Op::getInplaceVariant(operator_id); } void TorchSoftplusOp::appendOutlineAttributes( popart::OpSerialiserBase &os) const { popart::Op::appendOutlineAttributes(os); os.appendAttribute("beta", beta()); os.appendAttribute("threshold", threshold()); } TorchSoftplusInplaceOp::TorchSoftplusInplaceOp(const TorchSoftplusOp &op) : popart::ElementWiseInplaceUnaryOp( poptorch_custom_ops::torch_softplus_inplace, op.getSettings()), _beta(op.beta()), 
_threshold(op.threshold()) {} std::unique_ptr TorchSoftplusInplaceOp::clone() const { return std::make_unique(*this); } void TorchSoftplusInplaceOp::appendOutlineAttributes( popart::OpSerialiserBase &os) const { popart::Op::appendOutlineAttributes(os); os.appendAttribute("beta", beta()); os.appendAttribute("threshold", threshold()); } TorchSoftplusGradOp::TorchSoftplusGradOp(const TorchSoftplusOp &fwd_op) : popart::ElementWiseNonLinearUnaryGradOp( poptorch_custom_ops::torch_softplus_grad, fwd_op), _beta(fwd_op.beta()), _threshold(fwd_op.threshold()) {} std::unique_ptr TorchSoftplusGradOp::clone() const { return std::make_unique(*this); } void TorchSoftplusGradOp::appendOutlineAttributes( popart::OpSerialiserBase &os) const { popart::Op::appendOutlineAttributes(os); os.appendAttribute("beta", beta()); os.appendAttribute("threshold", threshold()); } namespace { popart::OpDefinition::DataTypes dtypes = { popart::DataType::UINT8, popart::DataType::UINT16, popart::DataType::UINT32, popart::DataType::UINT64, popart::DataType::INT8, popart::DataType::INT16, popart::DataType::INT32, popart::DataType::INT64, popart::DataType::FLOAT16, popart::DataType::FLOAT}; popart::OpDefinition softplus_def({popart::OpDefinition::Inputs({{"input", dtypes}}), popart::OpDefinition::Outputs({{"output", dtypes}}), popart::OpDefinition::Attributes({{"beta", {"*"}}, {"threshold", {"*"}}})}); popart::OpCreator softplus_creator( popart::OpDefinitions({{poptorch_custom_ops::torch_softplus, softplus_def}}), [](const popart::OpCreatorInfo &info) { float const beta = info.attributes.getAttribute("beta", 1.0); float const threshold = info.attributes.getAttribute("threshold", 1.0); return std::unique_ptr( new TorchSoftplusOp(info.opid, beta, threshold, info.settings)); }, true); } // namespace namespace pe = popops::expr; template std::unique_ptr create(popart::Op *op) { auto *x = dynamic_cast(op); if (x == nullptr) { throw popart::error("Invalid torch softplus operator."); } return 
TorchSoftplusComputex::get(x->beta(), x->threshold()); } TorchSoftplusOpx::TorchSoftplusOpx(popart::Op *op, popart::popx::Devicex *devicex) : ElementWiseUnaryOutplaceOpx(op, devicex, create(op)) { verifyOp(op, {poptorch_custom_ops::torch_softplus}); } void TorchSoftplusComputex::inplace(poplar::program::Sequence &prog, poplar::Graph &graph, const poplar::Tensor &tensor, const poplar::DebugNameAndId &dnai, const std::string &prefix) const { // Torch Softplus definition: // 1/beta * log[1 + exp(beta * x)] for beta * x <= threshold // x for beta * x > threshold // // To avoid overflow when evaluating the exp, we use the following equivalent // formula for softplus: // 1/beta * log[1 + exp(-abs(beta * x))] + max(x, 0) (void)prefix; // unused input parameter using ExprPtr = std::unique_ptr; std::vector exprs; exprs.push_back(std::make_unique(pe::_1)); if (_beta != 1.0f) { exprs.push_back(std::make_unique(pe::Const(_beta), *exprs.back())); } auto &bx = *exprs.back(); // log1p(exp(-|beta * x|)) exprs.push_back(std::make_unique(-pe::Abs(*exprs.back()))); exprs.push_back(std::make_unique(*exprs.back())); if (_beta != 1.0f) { exprs.push_back( std::make_unique(*exprs.back(), pe::Const(_beta))); } // 1/beta * log1p(exp(-|beta * x|)) + max(x, 0) exprs.push_back(std::make_unique(*exprs.back(), pe::Max(pe::_1, pe::Const(0.0f)))); // beta * x <= threshold ?
1/beta * log1p(exp(-|beta * x|)) + max(x, 0) : x exprs.push_back(std::make_unique(*exprs.back(), pe::_1, bx <= pe::Const(_threshold))); popops::mapInPlace(graph, *exprs.back(), {tensor}, prog, {dnai, "torch_softplus"}); } std::unique_ptr TorchSoftplusComputex::get(float beta, float threshold) { return std::make_unique(beta, threshold); } TorchSoftplusInplaceOpx::TorchSoftplusInplaceOpx(popart::Op *op, popart::popx::Devicex *devicex) : ElementWiseUnaryInplaceOpx(op, devicex, create(op)) { verifyOp(op, poptorch_custom_ops::torch_softplus_inplace); } TorchSoftplusGradOpx::TorchSoftplusGradOpx(popart::Op *op, popart::popx::Devicex *devicex) : Opx(op, devicex), _beta(), _threshold() { verifyOp(op, poptorch_custom_ops::torch_softplus_grad); auto &grad_op = getOp(); _beta = grad_op.beta(); _threshold = grad_op.threshold(); } void TorchSoftplusGradOpx::grow(poplar::program::Sequence &prog) const { // The derivative of the softplus activation function is: // // exp(beta*x)/(exp(beta*x) + 1) = 1/(exp(-beta*x) + 1) = sigmoid(beta*x) // // To match the Torch definition: // // grad_out = grad_in * sigmoid(beta*x) for beta * x <= threshold // grad_in for beta * x > threshold const auto &grad_in = getInTensor(TorchSoftplusGradOp::getGradInIndex()); const auto &fwd_input = getInTensor(TorchSoftplusGradOp::getFwdArgInIndex()); using ExprPtr = std::unique_ptr; std::vector exprs; exprs.push_back(std::make_unique(pe::_2)); if (_beta != 1.0f) { exprs.push_back(std::make_unique(pe::Const(_beta), *exprs.back())); } auto &bx = *exprs.back(); // grad_in * sigmoid(beta*x) exprs.push_back(std::make_unique(pe::_1, pe::Sigmoid(bx))); // beta * x <= threshold ?
grad_in * sigmoid(beta*x) : grad_in exprs.push_back(std::make_unique(*exprs.back(), pe::_1, bx <= pe::Const(_threshold))); auto output = popops::map(graph(), *exprs.back(), {grad_in, fwd_input}, prog, debugContext("torch_softplus_grad")); setOutTensor(TorchSoftplusGradOp::getOutIndex(), output); } namespace { popart::popx::OpxCreator softplus_opx(poptorch_custom_ops::torch_softplus); popart::popx::OpxCreator softplus_inplace_opx(poptorch_custom_ops::torch_softplus_inplace); popart::popx::OpxCreator softplus_grad_opx(poptorch_custom_ops::torch_softplus_grad); } // namespace } // namespace poptorch_custom_ops } // namespace poptorch ================================================ FILE: popart_compiler/source/custom_operations/TorchSoftplus.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #ifndef GUARD_POPTORCH_SOFTPLUS_HPP #define GUARD_POPTORCH_SOFTPLUS_HPP #include #include #include #include #include #include #include namespace poptorch { namespace poptorch_custom_ops { class TorchSoftplusOp : public popart::ElementWiseUnaryOp { public: TorchSoftplusOp(const popart::OperatorIdentifier &opid, float beta, float threshold, const popart::Op::Settings &settings); std::unique_ptr clone() const final; std::vector> getGradOps() final; std::vector> inplacePriorityDefault() const final; std::unique_ptr getInplaceVariant(const popart::OperatorIdentifier &id) const final; void appendOutlineAttributes(popart::OpSerialiserBase &os) const final; float beta() const { return _beta; } float threshold() const { return _threshold; } private: float _beta; float _threshold; }; class TorchSoftplusInplaceOp : public popart::ElementWiseInplaceUnaryOp { public: explicit TorchSoftplusInplaceOp(const TorchSoftplusOp &op); std::unique_ptr clone() const final; void appendOutlineAttributes(popart::OpSerialiserBase &os) const final; float beta() const { return _beta; } float threshold() const { return _threshold; } private: float 
_beta; float _threshold; }; class TorchSoftplusGradOp : public popart::ElementWiseNonLinearUnaryGradOp { public: explicit TorchSoftplusGradOp(const TorchSoftplusOp &fwd_op); std::unique_ptr clone() const final; void appendOutlineAttributes(popart::OpSerialiserBase &os) const final; float beta() const { return _beta; } float threshold() const { return _threshold; } private: float _beta; float _threshold; }; class TorchSoftplusComputex : public popart::popx::EwuComputex { public: TorchSoftplusComputex(float beta, float threshold) : _beta(beta), _threshold(threshold) {} void inplace(poplar::program::Sequence &prog, poplar::Graph &graph, const poplar::Tensor &tensor, const poplar::DebugNameAndId &dnai, const std::string &prefix) const final; static std::unique_ptr get(float beta, float threshold); private: float _beta; float _threshold; }; class TorchSoftplusOpx : public popart::popx::ElementWiseUnaryOutplaceOpx { public: TorchSoftplusOpx(popart::Op *op, popart::popx::Devicex *devicex); }; class TorchSoftplusInplaceOpx : public popart::popx::ElementWiseUnaryInplaceOpx { public: TorchSoftplusInplaceOpx(popart::Op *op, popart::popx::Devicex *devicex); }; class TorchSoftplusGradOpx : public popart::popx::Opx { public: TorchSoftplusGradOpx(popart::Op *op, popart::popx::Devicex *devicex); void grow(poplar::program::Sequence &prog) const final; private: float _beta; float _threshold; }; } // namespace poptorch_custom_ops } // namespace poptorch #endif ================================================ FILE: popart_compiler/source/custom_operations/UpsampleBilinear2d.cpp ================================================ // Copyright (c) 2021, Graphcore Ltd, All rights reserved. 
#include #include #include #include #include #include #include #include #include #include #include #include #include "popart_compiler/CodeletsCompilation.hpp" #include "popart_compiler/CompilerImpl.hpp" #include "popart_compiler/CustomOps.hpp" namespace { struct BilinearParams { size_t input0; size_t input1; float lambda0; float lambda1; }; float areaPixelComputeSourceIndex(float scale, size_t dst_index, bool align_corners, bool cubic) { if (align_corners) { return scale * dst_index; } const float src_idx = static_cast(scale * (dst_index + 0.5) - 0.5); // [Note] Follow Opencv resize logic: // We allow negative src_idx here and later will use // dx = src_idx - floorf(src_idx) // to compute the "distance"(which affects weights). // For linear modes, weight distribution doesn't matter // for negative indices as they use 2 pixels to interpolate. // For example, [-1, 0], they both use pixel 0 value so it // doesn't affect if we bound the src_idx to 0 or not. // TODO(mihailp): Our current linear mode impls use unbound indices // where we should and then remove this cubic flag. // This matters in cubic mode, as we might need [-1, 0, 1, 2] // to interpolate and the weights can be affected. return (!cubic && src_idx < 0) ? 0.0f : src_idx; } BilinearParams computeSourceIndexAndLambda(const float scale, size_t output_index, size_t input_size, bool align_corners) { if (scale == 1.0) { // scale_factor = 1, simply copy return {output_index, output_index, 1.0, 0.0}; } const float ratio = align_corners ? static_cast(input_size - 1) / (scale * input_size - 1.0) : 1.0f / scale; const float real_input_index = areaPixelComputeSourceIndex( ratio, output_index, align_corners, /*cubic=*/false); const size_t index0 = static_cast(real_input_index); const size_t offset = (index0 < input_size - 1) ? 
1 : 0; const float lambda1 = real_input_index - index0; return {index0, index0 + offset, 1.0f - lambda1, lambda1}; } poplar::VertexRef connectVertex( poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT const std::string &vertexName, // NOLINT const std::unordered_map &vars, const std::unordered_map> &vectors, size_t tile) { poplar::VertexRef vtx = graph.addVertex(cs, vertexName); for (const auto &p : vars) { graph.connect(vtx[p.first], p.second); } for (const auto &p : vectors) { graph.connect(vtx[p.first], p.second); } graph.setPerfEstimate(vtx, 1); graph.setTileMapping(vtx, tile); return vtx; } using WeightKey = std::tuple; using WeightMap = std::map; struct TileInputs { std::vector i00, i01, i10, i11, output; std::vector> weights; std::vector weights_t; }; using MultipleTileMap = std::map; poplar::Tensor bilinearMap(poplar::Graph &graph, // NOLINT poplar::program::Sequence &prog, // NOLINT const poplar::Tensor &input, float scale_factor, const bool align_corners = false, const poplar::DebugContext &dc = {}) { poputil::PoplibsOpDebugInfo const di(dc, DI_ARGS(input, scale_factor)); const auto input_dims = input.shape(); assert(input_dims.size() == 4); // NOLINT auto output_dims = input_dims; output_dims[2] = static_cast(std::floor(output_dims[2] * scale_factor)); output_dims[3] = static_cast(std::floor(output_dims[3] * scale_factor)); auto input_shuffled = input.dimShuffle({2, 3, 0, 1}) .reshape({input_dims[2], input_dims[3], input_dims[0] * input_dims[1]}); std::vector i00s; std::vector i01s; std::vector i10s; std::vector i11s; std::vector w00s; std::vector w01s; std::vector w10s; std::vector w11s; for (size_t h = 0; h < output_dims[2]; ++h) { const BilinearParams params_h = computeSourceIndexAndLambda( scale_factor, h, input_dims[2], align_corners); for (size_t w = 0; w < output_dims[3]; ++w) { const BilinearParams params_w = computeSourceIndexAndLambda( scale_factor, w, input_dims[3], align_corners); w00s.push_back(params_h.lambda0 * params_w.lambda0); 
w01s.push_back(params_h.lambda0 * params_w.lambda1); w10s.push_back(params_h.lambda1 * params_w.lambda0); w11s.push_back(params_h.lambda1 * params_w.lambda1); i00s.push_back(input_shuffled[params_h.input0][params_w.input0]); i01s.push_back(input_shuffled[params_h.input0][params_w.input1]); i10s.push_back(input_shuffled[params_h.input1][params_w.input0]); i11s.push_back(input_shuffled[params_h.input1][params_w.input1]); } } poplar::Tensor const i00 = poplar::concat(i00s).reshape( {output_dims[2], output_dims[3], output_dims[0], output_dims[1]}); poplar::Tensor const i01 = poplar::concat(i01s).reshape( {output_dims[2], output_dims[3], output_dims[0], output_dims[1]}); poplar::Tensor const i10 = poplar::concat(i10s).reshape( {output_dims[2], output_dims[3], output_dims[0], output_dims[1]}); poplar::Tensor const i11 = poplar::concat(i11s).reshape( {output_dims[2], output_dims[3], output_dims[0], output_dims[1]}); const poplar::ArrayRef w00_ref{w00s}; const poplar::ArrayRef w01_ref{w01s}; const poplar::ArrayRef w10_ref{w10s}; const poplar::ArrayRef w11_ref{w11s}; poplar::Tensor const w00 = graph.addConstant( input.elementType(), {output_dims[2], output_dims[3], 1, 1}, w00_ref, {di, "w00"}); poputil::mapTensorLinearly(graph, w00); poplar::Tensor const w01 = graph.addConstant( input.elementType(), {output_dims[2], output_dims[3], 1, 1}, w01_ref, {di, "w01"}); poputil::mapTensorLinearly(graph, w01); poplar::Tensor const w10 = graph.addConstant( input.elementType(), {output_dims[2], output_dims[3], 1, 1}, w10_ref, {di, "w10"}); poputil::mapTensorLinearly(graph, w10); poplar::Tensor const w11 = graph.addConstant( input.elementType(), {output_dims[2], output_dims[3], 1, 1}, w11_ref, {di, "w11"}); poputil::mapTensorLinearly(graph, w11); poplar::Tensor const output = popops::map( graph, popops::expr::_1 * popops::expr::_2 + popops::expr::_3 * popops::expr::_4 + popops::expr::_5 * popops::expr::_6 + popops::expr::_7 * popops::expr::_8, {i00, w00, i01, w01, i10, w10, i11, w11}, 
prog, {di, "mapUpsampling"}); return output.dimShuffle({2, 3, 0, 1}); } using GradMultipleKey = std::pair; struct GradMultipleVal { float lambda0, lambda1; size_t h, w; }; using GradMultipleMap = std::map>; GradMultipleMap computeGradMap(size_t in_height, size_t in_width, size_t out_height, size_t out_width, float scale_factor, bool align_corners) { GradMultipleMap m; for (size_t h = 0; h < in_height; ++h) { const BilinearParams params_h = computeSourceIndexAndLambda(scale_factor, h, out_height, align_corners); for (size_t w = 0; w < in_width; ++w) { const BilinearParams params_w = computeSourceIndexAndLambda( scale_factor, w, out_width, align_corners); m[{params_h.input0, params_w.input0}].push_back( GradMultipleVal{params_h.lambda0, params_w.lambda0, h, w}); m[{params_h.input0, params_w.input1}].push_back( GradMultipleVal{params_h.lambda0, params_w.lambda1, h, w}); m[{params_h.input1, params_w.input0}].push_back( GradMultipleVal{params_h.lambda1, params_w.lambda0, h, w}); m[{params_h.input1, params_w.input1}].push_back( GradMultipleVal{params_h.lambda1, params_w.lambda1, h, w}); } } return m; } std::pair, std::vector> computeInputsWeights(const std::vector &vals, const poplar::Tensor &inputTensor) { std::vector inputs; std::vector weights; size_t prev_w = -1; size_t prev_h = -1; for (const auto &v : vals) { const float weight = v.lambda0 * v.lambda1; if (weight > 0.0f) { if (v.h == prev_h && v.w == prev_w) { weights.back() += weight; } else { weights.push_back(weight); inputs.push_back(inputTensor[v.h][v.w]); prev_w = v.w; prev_h = v.h; } } } return std::make_pair(inputs, weights); } void splitIntervalMultiple( poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT size_t tile, const std::vector &intervals, const poplar::Tensor &input, poplar::Tensor &output, // NOLINT const GradMultipleMap &m, const poplar::DebugContext &di) { const auto &full_interval = *intervals.begin(); size_t start_block = full_interval.begin(); const size_t block_size = output.shape()[2]; 
while (start_block < full_interval.end()) { const size_t end_block = std::min(start_block + block_size - (start_block % block_size), full_interval.end()); std::vector start_coords = poputil::unflattenIndex(output.shape(), start_block); std::vector end_coords = poputil::unflattenIndex(output.shape(), end_block - 1); assert(start_coords[0] == end_coords[0]); // NOLINT assert(start_coords[1] == end_coords[1]); // NOLINT const auto iter = m.find({start_coords[0], start_coords[1]}); assert(iter != m.end()); // NOLINT std::vector inputs; std::vector weights; std::tie(inputs, weights) = computeInputsWeights(iter->second, input); poplar::Tensor weights_t = graph.addConstant( input.elementType(), {weights.size()}, poplar::ArrayRef(weights), {di, "upsamplingGradWeights"}); graph.setTileMapping(weights_t, tile); poplar::Tensor const full_input_t = poplar::concat(inputs).reshape({inputs.size(), block_size}); poplar::Tensor const input_t = full_input_t.slice( {0, start_coords[2]}, {inputs.size(), end_coords[2] + 1}); graph.setTileMapping(input_t, tile); poplar::Interval const interval{start_block, end_block}; (void)connectVertex( graph, cs, poputil::templateVertex("BilinearGradVertex", input.elementType()), {{"out", output.flatten().slice(interval)}, {"w", weights_t}, {"input", input_t.transpose().flatten()}}, {}, tile); start_block = end_block; } } void splitInterval(poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT size_t tile, const std::vector &intervals, const poplar::Tensor &input, poplar::Tensor &output, // NOLINT const GradMultipleMap &m, const poplar::DebugContext &di) { const auto regions = poputil::splitRegionsBetweenWorkers(graph.getTarget(), intervals, 1); const size_t block_size = output.shape()[2]; const auto &full_interval = *intervals.begin(); std::vector start_coords = poputil::unflattenIndex(output.shape(), full_interval.begin()); std::vector end_coords = poputil::unflattenIndex(output.shape(), full_interval.end() - 1); assert(start_coords[0] == 
end_coords[0]); // NOLINT assert(start_coords[1] == end_coords[1]); // NOLINT const auto iter = m.find({start_coords[0], start_coords[1]}); assert(iter != m.end()); // NOLINT std::vector inputs; std::vector weights; std::tie(inputs, weights) = computeInputsWeights(iter->second, input); poplar::Tensor weights_t = graph.addConstant( input.elementType(), {weights.size()}, poplar::ArrayRef(weights), {di, "upsamplingGradWeights"}); graph.setTileMapping(weights_t, tile); poplar::Tensor const full_input_t = poplar::concat(inputs).reshape({inputs.size(), block_size}); for (const auto &r : regions) { assert(r.size() == 1); // NOLINT const auto &interval = *r.begin(); start_coords = poputil::unflattenIndex(output.shape(), interval.begin()); end_coords = poputil::unflattenIndex(output.shape(), interval.end() - 1); assert(start_coords[0] == end_coords[0]); // NOLINT assert(start_coords[1] == end_coords[1]); // NOLINT poplar::Tensor const input_t = full_input_t.slice( {0, start_coords[2]}, {inputs.size(), end_coords[2] + 1}); graph.setTileMapping(input_t, tile); (void)connectVertex( graph, cs, poputil::templateVertex("BilinearGradVertex", input.elementType()), {{"out", output.flatten().slice(interval)}, {"w", weights_t}, {"input", input_t.transpose().flatten()}}, {}, tile); } } void splitIntervalMultiplePixels(poplar::Graph &graph, // NOLINT poplar::ComputeSet &cs, // NOLINT size_t tile, const std::vector &intervals, const poplar::Tensor &input, poplar::Tensor &output, // NOLINT const GradMultipleMap &m, const poplar::DebugContext &di) { const size_t block_size = output.shape()[2]; // each pixel is block_size in length const auto regions = poputil::splitRegionsBetweenWorkers( graph.getTarget(), intervals, block_size); for (const auto &r : regions) { assert(r.size() == 1); // NOLINT const auto &interval = *r.begin(); assert((interval.size() % block_size) == 0); // NOLINT size_t start_block = interval.begin(); std::vector full_inputs; std::vector full_weights; std::vector limits; 
while (start_block < interval.end()) { const size_t end_block = start_block + block_size; std::vector start_coords = poputil::unflattenIndex(output.shape(), start_block); const std::vector end_coords = poputil::unflattenIndex(output.shape(), end_block - 1); assert(start_coords[0] == end_coords[0]); // NOLINT assert(start_coords[1] == end_coords[1]); // NOLINT const auto iter = m.find({start_coords[0], start_coords[1]}); assert(iter != m.end()); // NOLINT std::vector inputs; std::vector weights; std::tie(inputs, weights) = computeInputsWeights(iter->second, input); limits.push_back(weights.size()); std::copy(weights.begin(), weights.end(), std::back_inserter(full_weights)); std::copy(inputs.begin(), inputs.end(), std::back_inserter(full_inputs)); start_block = end_block; } poplar::Tensor weights_t = graph.addConstant( input.elementType(), {full_weights.size()}, poplar::ArrayRef(full_weights), {di, "upsamplingGradWeights"}); graph.setTileMapping(weights_t, tile); poplar::Tensor limits_t = graph.addConstant( poplar::UNSIGNED_INT, {limits.size()}, poplar::ArrayRef(limits), {di, "upsamplingGradLimits"}); graph.setTileMapping(limits_t, tile); poplar::Tensor const full_input_t = poplar::concat(full_inputs).reshape({full_inputs.size(), block_size}); graph.setTileMapping(full_input_t, tile); assert(0 == (interval.size() % block_size)); // NOLINT (void)connectVertex(graph, cs, poputil::templateVertex("BilinearGradMultipleVertex", input.elementType()), {{"out", output.flatten().slice(interval)}, {"w", weights_t}, {"limits", limits_t}, {"input", full_input_t.transpose().flatten()}}, {}, tile); } } void processTile(poplar::Graph &graph, poplar::ComputeSet &cs, // NOLINT size_t tile, const std::vector &intervals, const poplar::Tensor &input, poplar::Tensor &output, // NOLINT const GradMultipleMap &m, const poplar::DebugContext &di) { assert(intervals.size() == 1); // NOLINT const poplar::Interval &interval = *intervals.begin(); const size_t block_size = output.shape()[2]; const 
size_t block_start = interval.begin() - (interval.begin() % block_size); const size_t aligned_size = interval.end() - block_start; const uint32_t nb_blocks = std::ceil( static_cast(aligned_size / static_cast(block_size))); if (nb_blocks == 1) { splitInterval(graph, cs, tile, intervals, input, output, m, di); } else { if (nb_blocks <= 6) { splitIntervalMultiple(graph, cs, tile, intervals, input, output, m, di); } else { splitIntervalMultiplePixels(graph, cs, tile, intervals, input, output, m, di); } } } using Mapping = std::vector>; std::vector splitMapping(const Mapping &m, uint32_t partitions, uint32_t block_size) { if (partitions == 1) { return {m}; } std::vector res(partitions); for (const auto &m_i : m) { const auto regions = poputil::splitRegions(m_i, block_size, partitions); for (size_t j = 0; j < regions.size(); ++j) { res[j].push_back(regions[j]); } } return res; } poplar::Tensor bilinearMapGrads(poplar::Graph &graph, // NOLINT poplar::program::Sequence &prog, // NOLINT const poplar::Tensor &grad_output, float scale_factor, bool align_corners, uint32_t nb_splits = 0, const poplar::DebugContext &dc = {}) { poputil::PoplibsOpDebugInfo const di(dc, DI_ARGS(grad_output, scale_factor)); const auto grad_output_dims = grad_output.shape(); assert(grad_output_dims.size() == 4); // NOLINT auto grad_input_dims = grad_output_dims; grad_input_dims[2] = static_cast(std::floor(grad_output_dims[2] / scale_factor)); grad_input_dims[3] = static_cast(std::floor(grad_output_dims[3] / scale_factor)); auto grad_input = graph.addVariable( grad_output.elementType(), grad_input_dims, {di, "gradientsInput_" + std::to_string(grad_input_dims[2])}); auto grad_input_shuffled = grad_input.dimShuffle({2, 3, 0, 1}) .reshape({grad_input_dims[2], grad_input_dims[3], grad_input_dims[0] * grad_input_dims[1]}); size_t grain_size = 1; const size_t nb_pixels = grad_input_dims[2] * grad_input_dims[3]; const size_t num_tiles = graph.getTarget().getNumTiles(); const size_t num_workers = 
graph.getTarget().getNumWorkerContexts(); if (nb_pixels / num_tiles > num_workers) { grain_size = grad_output_dims[0] * grad_output_dims[1]; } poputil::mapTensorLinearly(graph, grad_input_shuffled, 1, grain_size); auto grad_output_shuffled = grad_output.dimShuffle({2, 3, 0, 1}) .reshape({grad_output_dims[2], grad_output_dims[3], grad_output_dims[0] * grad_output_dims[1]}); const GradMultipleMap m = computeGradMap( grad_output_dims[2], grad_output_dims[3], grad_input_dims[2], grad_input_dims[3], scale_factor, align_corners); const auto &full_mapping = graph.getTileMapping(grad_input_shuffled); if (nb_splits == 0) { // try to guess a good split nb_splits = 1; const uint32_t blocks_per_tile = std::ceil(static_cast(nb_pixels) / static_cast(num_tiles)); if (blocks_per_tile > 6) { if (blocks_per_tile <= 12) { nb_splits = 2; } else { if (blocks_per_tile > 12) { // ? nb_splits = 3; } } } } const auto mappings = splitMapping(full_mapping, nb_splits, grad_output_dims[0] * grad_output_dims[1]); for (size_t split = 0; split < mappings.size(); ++split) { poplar::ComputeSet compute_set = graph.addComputeSet({di, "upsamplingGrad_" + std::to_string(split) + "_" + std::to_string(grad_input_dims[2])}); const auto &mapping = mappings[split]; for (size_t tile = 0; tile < mapping.size(); ++tile) { const auto &intervals = mapping[tile]; if (!intervals.empty()) { processTile(graph, compute_set, tile, intervals, grad_output_shuffled, grad_input_shuffled, m, di); } } prog.add(poplar::program::Execute(compute_set, di)); } return grad_input; } // For training with a custom Op, four classes need to be implemented, // one for each of: // {forward, gradient} x {Op, Opx}. // // If only inference is required, then two classes need to be implemented: // {forward} x {Op, Opx}. // // The Op is a poplar/hardware agnostic description of the computation. // the Opx is the poplar implementation of the Op. 
// // We do training in this example, so the four classes implemented are: // class UpsampleOp; class UpsampleGradOp; class UpsampleOpx; class UpsampleGradOpx; namespace { // for C++11 compatibility, we don't use std::make_unique template std::unique_ptr makeUnique(Args &&...args) { return std::unique_ptr(new T(std::forward(args)...)); } } // namespace // The gradient Op class UpsampleGradOp : public popart::Op { public: explicit UpsampleGradOp(const UpsampleOp &fwdOp); std::unique_ptr clone() const final { return makeUnique(*this); } // The output popart Tensor has the same numerical type, batch size and // channel count as the input Tensor; its height and width are the input's // divided by the scaling factor, rounded down. This function is // required for shape/type inference // void setup() final { auto input_info = inInfo(0); assert(input_info.rank() == 4); // NOLINT auto batch_size = input_info.dim(0); auto channels = input_info.dim(1); auto height = input_info.dim(2); auto width = input_info.dim(3); const int64_t output_height = static_cast(std::floor(height / _scalingFactor)); const int64_t output_width = static_cast(std::floor(width / _scalingFactor)); outInfo(0).set(input_info.dataType(), {batch_size, channels, output_height, output_width}); } // function describing the inputs and output(s) of UpsampleGradOp // The Gradient Op which we are implementing (UpsampleGradOp) has 2 inputs. // The input at index 0 is: // the gradient of the 0'th output Tensor of the UpsampleOp. // The input at index 1 is : // the 0'th output Tensor of the UpsampleOp. 
// Supposing the UpsampleOp has input Tensor T0 and output Tensor T1, // // input at index 0 (T0) // | // UpsampleOp // | // output at index 0 (T1) // // Then the picture described by the map below looks like, // // // input at index 0 (gradient of T1) // | input at index 1 (T1) // | | // | | // UpsampleGradOp // | // | // output at index 0 (gradient of T0) // const std::vector &gradInputInfo() const override { static const std::vector in_info = { {0, 0, popart::GradOpInType::GradOut}, {1, 0, popart::GradOpInType::Out}}; return in_info; } // The Grad Op only has one output, at index 0. The output at index 0 // is the gradient of the input at index 0 of the UpsampleOp const std::map &gradOutToNonGradIn() const override { static const std::map out_info = {{0, 0}}; return out_info; } // an estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } float getScalingFactor() const { return _scalingFactor; } bool getAlignCorners() const { return _alignCorners; } // Implementation defined below void appendAttributes(popart::OpSerialiserBase &os) const override; // Implementation defined below void appendOutlineAttributes(popart::OpSerialiserBase &os) const override; private: float _scalingFactor; bool _alignCorners; }; // The forward Op class UpsampleOp : public popart::Op { public: UpsampleOp(const popart::OperatorIdentifier &_opid, float scalingFactor, bool alignCorners, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _scalingFactor{scalingFactor}, _alignCorners(alignCorners) {} // same comment as for UpsampleGradOp, for running shape/type inference // "statically" void setup() override { auto input_info = inInfo(0); assert(input_info.rank() == 4); // NOLINT auto batch_size = input_info.dim(0); auto channels = input_info.dim(1); auto height = input_info.dim(2); auto width = input_info.dim(3); const int64_t output_height = static_cast(std::floor(height * _scalingFactor)); const int64_t 
output_width = static_cast(std::floor(width * _scalingFactor)); outInfo(0).set(input_info.dataType(), {batch_size, channels, output_height, output_width}); } std::unique_ptr clone() const final { return makeUnique(*this); } // There is only one Gradient Op for UpsampleOp, a UpsampleGradOp // It is possible to have multiple Gradient Ops // (Conv has 2 in popart, one for weights and one for activations) // std::vector> getGradOps() override { std::vector> upops; // NOLINT upops.emplace_back(new UpsampleGradOp(*this)); // NOLINT return upops; } void appendAttributes(popart::OpSerialiserBase &os) const override { Op::appendAttributes(os); os.appendAttribute("scaling_factor", getScalingFactor()); os.appendAttribute("align_corners", getAlignCorners()); } void appendOutlineAttributes(popart::OpSerialiserBase &os) const override { Op::appendOutlineAttributes(os); os.appendAttribute("scaling_factor", getScalingFactor()); os.appendAttribute("align_corners", getAlignCorners()); } // an estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } float getScalingFactor() const { return _scalingFactor; } bool getAlignCorners() const { return _alignCorners; } private: float _scalingFactor; bool _alignCorners; }; // describe the inputs and outputs that are supported by the operation popart::OpDefinition::DataTypes t = {popart::DataType::FLOAT16, popart::DataType::FLOAT}; popart::OpDefinition upsample_op_def( {popart::OpDefinition::Inputs({{"input", t}}), popart::OpDefinition::Outputs({{"output", t}}), popart::OpDefinition::Attributes({{"scaling_factor", {"*"}}, {"align_corners", {"*"}}})}); popart::OpCreator upsample_op_creator( popart::OpDefinitions({{poptorch::poptorch_custom_ops::upsample_bilinear2d, upsample_op_def}}), [](const popart::OpCreatorInfo &info) { // default scalingFactor is 2.0 float const scaling_factor = info.attributes.getAttribute( "scaling_factor", 2.0f); int const align_corners = 
info.attributes.getAttribute("align_corners", 0); return std::make_unique(info.opid, scaling_factor, align_corners, info.settings); }, true); // forward Opx (poplar implementation of the forward Op) class UpsampleOpx : public popart::popx::Opx { public: UpsampleOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { // not strictly necessary, we check that op is castable to a UpsampleOp *. verifyOp(op, poptorch::poptorch_custom_ops::upsample_bilinear2d); // Get around the ABI issues. auto managed_ptr = poptorch::popart_compiler::compileCustomCodeletIfNeeded( "UpsampleBilinear2dCodelets.inc.cpp", /*hw_only_codelet=*/false); const char *compiled_codelet_path = static_cast(managed_ptr.get()); graph().addCodelets(std::string(compiled_codelet_path)); } void grow(poplar::program::Sequence &prog) const final { // Upsample the input. We create a poplar::Tensor of name outId(0) std::cerr << "Debug UpsampleOpx::grow\n"; auto op = getOp(); const float scaling_factor = op.getScalingFactor(); const bool align_corners = op.getAlignCorners(); auto input = getInTensor(0); setOutTensor( 0, bilinearMap(graph(), prog, input, scaling_factor, align_corners)); } }; // backward Opx (poplar implementation of the backward Op) class UpsampleGradOpx : public popart::popx::Opx { public: UpsampleGradOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp( op, poptorch::poptorch_custom_ops::upsample_bilinear2d_grad); } // Create the gradient poplar::Tensor, which is // 3 * input_to_upsample**2 * gradient_of_upsample_output void grow(poplar::program::Sequence &prog) const final { std::cerr << "Debug UpsampleGradOpx::grow\n"; auto fwd_input = getInTensor(0); auto grad_out = getInTensor(1); auto op = getOp(); const float scaling_factor = op.getScalingFactor(); const bool align_corners = op.getAlignCorners(); setOutTensor(0, bilinearMapGrads(graph(), prog, grad_out, scaling_factor, align_corners)); } }; 
// Builds the gradient op from its forward op: it reuses the forward op's
// settings and copies its scaling factor and align_corners flag.
UpsampleGradOp::UpsampleGradOp(const UpsampleOp &fwdOp)
    : popart::Op(poptorch::poptorch_custom_ops::upsample_bilinear2d_grad,
                 fwdOp.settings),
      _scalingFactor{fwdOp.getScalingFactor()}, _alignCorners{
                                                    fwdOp.getAlignCorners()} {}

// Serialises the base Op attributes followed by the scaling factor and the
// align_corners flag.
void UpsampleGradOp::appendAttributes(popart::OpSerialiserBase &os) const {
  Op::appendAttributes(os);
  os.appendAttribute("scaling_factor", getScalingFactor());
  os.appendAttribute("align_corners", getAlignCorners());
}

// Same two attributes, appended after the base Op outline attributes.
void UpsampleGradOp::appendOutlineAttributes(
    popart::OpSerialiserBase &os) const {
  Op::appendOutlineAttributes(os);
  os.appendAttribute("scaling_factor", getScalingFactor());
  os.appendAttribute("align_corners", getAlignCorners());
}

// Static registration of the Opx implementations for the forward and the
// gradient operator identifiers.
// NOTE(review): the template arguments of OpxCreator are missing in this dump.
popart::popx::OpxCreator
    upsample_opx_creator(poptorch::poptorch_custom_ops::upsample_bilinear2d);
popart::popx::OpxCreator upsample_grad_opx_creator(
    poptorch::poptorch_custom_ops::upsample_bilinear2d_grad);

} // namespace

================================================
FILE: popart_compiler/source/custom_operations/UpsampleBilinear2dCodelets.inc.cpp
================================================
// Copyright (c) 2021, Graphcore Ltd, All rights reserved.
// NOTE(review): the include targets and all template parameter/argument lists
// in this file are missing from the dump; the tokens are kept as found.
#include
#include
#include
#include
#ifdef __IPU__
#include
#endif
#include
#include

static constexpr auto ONE_PTR = poplar::VectorLayout::ONE_PTR;

// Forward-pass vertex: every output element is the weighted sum of four
// consecutive input values, using the same four weights w[0..3] for all
// outputs handled by this vertex.
template class BilinearMultipleVertex : public poplar::Vertex {
public:
  poplar::Input> inputs;
  poplar::Output> out;
  poplar::Input> w;

  bool compute() {
    // `offset` walks through `inputs` in groups of four values per output.
    unsigned int offset = 0;
    for (unsigned int i = 0; i < out.size(); ++i) {
      out[i] = inputs[offset] * w[0] + inputs[offset + 1] * w[1] +
               inputs[offset + 2] * w[2] + inputs[offset + 3] * w[3];
      offset += 4;
    }
    return true;
  }
};
template class BilinearMultipleVertex;
template class BilinearMultipleVertex;

// Gradient vertex for a single pixel: every output element is the dot product
// of w.size() consecutive input values with the weight vector `w`.
template class BilinearGradVertex : public poplar::Vertex {
public:
  poplar::Input> input;
  poplar::Input> w;
  poplar::Output> out;

  bool compute() {
    unsigned int offset = 0;
    for (unsigned int i = 0; i < out.size(); ++i) { // b x c
      // Accumulate in float regardless of the element type of the tensors.
      float res = 0.0f;
      for (unsigned int j = 0; j < w.size(); ++j) {
        res += float(input[offset + j] * w[j]);
      }
      out[i] = res;
      offset += w.size();
    }
    return true;
  }
};
template class BilinearGradVertex;
template class BilinearGradVertex;

// Gradient vertex handling several pixels at once. `limits` holds one entry
// per pixel: the number of (input, weight) pairs contributing to that pixel.
// The output is laid out pixel-major: out[pixel * block_size + i].
template class BilinearGradMultipleVertex : public poplar::Vertex {
public:
  poplar::Input> input;
  poplar::Input> w;
  poplar::Input> limits;
  poplar::Output> out;

  bool compute() {
    // `offset` keeps advancing through `input` over all blocks and pixels.
    unsigned int offset = 0;
    // Number of output elements per pixel.
    // NOTE(review): divides by limits.size(); presumably `limits` is never
    // empty - confirm against the code that creates this vertex.
    const size_t block_size = out.size() / limits.size();
    for (unsigned int i = 0; i < block_size; ++i) { // b x c
      // The weights are shared by all blocks, so the weight offset restarts
      // for each block while the input offset does not.
      unsigned int w_offset = 0;
      unsigned int pixel = 0;
      for (unsigned int limit : limits) {
        float res = 0.0f;
        for (unsigned int j = 0; j < limit; ++j) {
          res += float(input[offset + j] * w[w_offset + j]);
        }
        out[pixel * block_size + i] = res;
        offset += limit;
        w_offset += limit;
        ++pixel;
      }
    }
    return true;
  }
};
template class BilinearGradMultipleVertex;
template class BilinearGradMultipleVertex;

================================================
FILE: popart_compiler/source/include/popart_compiler/CompilerImpl.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "popart_compiler/Compiler.hpp" #include "popart_compiler/CompilerOptions.hpp" #include "popart_compiler/MultiConvBuilder.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace popart_compiler { class Compiler; namespace detail { /* We use this structure to maintain all the information related to a CPU callback. This is used by the custom op to create the poplar tensors and by the compiler to create the poplar callbacks. */ struct CallbackInternalMetadata { // We need a unique ID for each so we can track how many we've added. static std::uint32_t number_of_added_ops; // The thing we are calling back. std::function the_callback; // Pointers to the buffers on host. std::vector input_pointers; std::vector output_pointers; // The names of the operation which we give on creation. The custom op needs // to see these to create the operation and the compiler needs it to attach // the callbacks. std::string handle; // Type and shape info for the input and outputs. std::vector input_types; std::vector> input_shapes; std::vector output_types; std::vector> output_shapes; // The callbacks are called in random order so we need to track how many have // copied their data to make sure we only call the host function once all of // them have copied it. 
std::atomic number_of_input_streams_inited; }; class StepIO : public popart::IStepIO { public: struct ArrayInfo { popart::IArray &array; int64_t offset; int64_t end_offset; int64_t replica_idx; }; using ArrayType = popart::IArray; using AccessorType = popart::StepIONS::IArrayAccessor; using TensorArrayMap = std::map; using TensorTimestamps = std::map>; using TensorArrayInfo = std::map; using TensorStepDataInfo = std::map; StepIO() = default; popart::ConstVoidData in(popart::TensorId id, int64_t num_elems, // NOLINT bool prefetch, bool /*isBroadcast*/) override; void inComplete(popart::TensorId id, int64_t num_elems, bool) override; // NOLINT popart::MutableVoidData out(popart::TensorId id, int64_t num_elems) override; // NOLINT void outComplete(popart::TensorId id) override; // NOLINT void computeStepDataInfo(const popart::TensorId &id, popart::IArray *array); void populate(const TensorArrayMap &inputs, const TensorArrayMap &outputs); template T get(const popart::TensorId &id, TensorArrayInfo *map, int64_t num_elems, bool is_input); static void timestamp(TensorTimestamps *time, const popart::TensorId &id); void assertNumElements( const popart::popx::Executablex & /*unused*/) const override {} const std::vector & getInputTimestamps(const popart::TensorId &id) const { return _in_times.at(id); } const std::vector & getInputCompleteTimestamps(const popart::TensorId &id) const { return _in_complete_times.at(id); } const std::vector & getOutputTimestamps(const popart::TensorId &id) const { return _out_times.at(id); } const std::vector & getOutputCompleteTimestamps(const popart::TensorId &id) const { return _out_complete_times.at(id); } void setInputGroupings(popart::CommGroupType type, int64_t input_group_size, int64_t replica_count); protected: TensorArrayInfo _inputs_info; TensorArrayInfo _outputs_info; TensorStepDataInfo _step_data_info; TensorTimestamps _in_times; TensorTimestamps _in_complete_times; TensorTimestamps _out_times; TensorTimestamps _out_complete_times; 
popart::CommGroupType _input_cgt; int64_t _input_group_size; int64_t _replica_count; }; class WeightsIO : public popart::IWeightsIO { public: ~WeightsIO() override = default; bool contains(popart::TensorId id) const final; popart::MutableVoidData weight(popart::TensorId id) const final; void registerParameter(const popart::TensorId &id, const popart::TensorInfo &info); void updateData(const std::vector &host_buffers); const std::vector ¶meterIds() const; private: std::map _weights; std::vector _weights_order; }; // Compare a ConstVoidData based on type, shape, and data struct ConstVoidDataLessThan { bool operator()(const popart::ConstVoidData &lhs, const popart::ConstVoidData &rhs) const; }; struct CompilerImpl { public: friend Compiler; CompilerImpl() : op_builder(popart::Builder::create()) { ids.emplace_back(""); // None tensor ids_types.push_back(PopartType::UNDEFINED); active_builder = op_builder.get(); using_overlapped_io = false; } ~CompilerImpl(); std::unique_ptr op_builder; // Op_builder is the top level graph. However to support subgraphs we switch // between adding ops from each of these subgraphs. All subgraphs are children // of the op_builder top level graph. popart::Builder *active_builder; // Stacks for subgraphs realizing true/false branch paths. std::stack if_true_stack; std::stack if_false_stack; std::map anchors; std::vector ids; std::vector ids_types; // Input tensors to the session. std::map popart_incoming; // Output tensors for the session. std::map popart_outgoing; std::map> outgoing_duplicates; std::vector inputs; std::vector outputs; // Flat representation of the output shapes std::vector output_types; // A list to allocate our buffers in so they get released. 
std::list> memory_manager; std::unique_ptr session; StepIO stepio; WeightsIO weights; WeightsIO updatable_named_buffers; WeightsIO optim_state_tensors; bool is_training = false; // At least one use of overlapped host IO bool using_overlapped_io = false; // Record the final loss, it is guaranteed by previous passes to be just one // loss. popart::TensorId loss; // List of options which have been explicitely set by the user. std::set options_set; popart::SessionOptions popart_options{}; CompilerOptions options{}; // We add operations using a state based system so the user would set the // active IPU and all subsequent operations will be added to that IPU until // stopped. // By default, the active IPU is 0 in case setActiveIpu is never used. // However, clearActiveIpu will set it to -1 making future use of // setActiveIpu compulsory. std::int64_t active_ipu{0}; std::uint64_t active_stage{0}; std::int64_t active_phase{0}; // Keep track of what the maximum phase number used is. std::int64_t max_phase{0}; // Number of ipus used (set by createDevice()) std::uint64_t num_ipus{0}; // Which IPUs are being used // Note that this does not take into account replication and so the number of // IPUs actually used is multiplied by popart_options.replicatedGraphCount. // Due to rounding and the issues with skipping an IPU in a range, the number // of IPUs required may increase further. std::unordered_set used_ipus; // Keep the number of ipu switches to work out the number of pipeline stages // if relevant. std::uint64_t num_ipu_switches{0}; // Store the last ipu used: this will always match active_ipu unless // active_ipu is set to -1. std::uint64_t last_ipu_used{0}; // Map of the pytorch variable update group to the popart weight. std::map> grad_update_groups; std::unique_ptr multi_conv_builder; // Dynamic container for all the callbacks to live in. 
std::list callbacks; // Returns the number of pipeline stages in the model execution std::uint64_t numPipelineStages(); popart::SourceLocation code_location; std::string torch_node; // General helpers. // Inserts memory into the list of tensors being output by the model. void addMemoryToOutput(TensorId id, void *ptr, std::unique_ptr &&memory); // Domain helpers popart::TensorId reshape(const std::vector &tensors, const std::vector &shape); void addOutputTensor(const std::vector &tensors); popart::TensorId addUntypedInputTensor(const std::vector &tensors); std::vector customOperation( const std::vector &args, const std::string &op, const std::string &domain, std::int64_t version, std::int64_t num_outputs, const std::shared_ptr> &attributes); popart::TensorId recomputationCheckpoint(const std::vector &tensors); popart::TensorId tensorConstant(const std::vector &tensors, const PopartConstant &constant); TensorId hostSideTensorConstant(const std::vector &tensors, HostSideConstant constant); popart::TensorId addNotInPlace(const std::vector &in); popart::TensorId randomNormal(const std::vector &tensors, const std::vector &shape, float mean, float scale, const std::string &dtype); popart::TensorId randomUniform(const std::vector &tensors, const std::vector &shape, float high, float low, const std::string &dtype); popart::TensorId ones(const std::vector &tensors, const std::vector &shape, const std::string &dtype); popart::TensorId zeros(const std::vector &tensors, const std::vector &shape, const std::string &dtype); popart::TensorId zerosOrOnes(const std::vector &tensors, const std::vector &shape, const std::string &dtype, bool zeros); popart::TensorId unfold(const std::vector &tensors, int64_t dimension, int64_t size, int64_t step); popart::TensorId prelu(std::vector &tensors); void addMultiConvPart(const std::vector &tensors, const std::vector &dilations, const std::vector &kernel_shape, const std::vector &pads, const std::vector &strides); std::vector endMultiConv(); 
void optimizerGroup(const std::vector &tensors, int64_t group) { std::vector ins; std::transform(tensors.begin(), tensors.end(), std::back_inserter(ins), [&](TensorId index) { return ids[index]; }); grad_update_groups.insert({group, ins}); } std::unique_ptr getPopartOptimizer(std::vector optimizers); void updateUseModelConfig(); std::string checkSystemConfig() const; template void setOptionIfNotSet(T &option, U value, const std::string &name, const std::string &value_as_string) { if (options_set.count(name) && option != static_cast(value)) { logging::warn("{} forced by the user from default to {}, " "ignoring value {}", name, option, value_as_string); } else { logging::debug("{} set to value {}", name, value_as_string); option = value; } } template void setOptionIfNotSet(T &option, U value, const std::string &name) { setOptionIfNotSet(option, value, name, std::to_string(value)); } void setExecutionStrategyAttributes(const std::set &tensors); const HostSideConstant &getHostSideConstant(TensorId id) const; bool isHostSideConstant(TensorId id) const; /* must_attach is a special case for on_demand devices */ std::shared_ptr createDevice(bool must_attach = false); bool waitIfUnavailable() const; void attachToDevice(); void detachFromDevice(); bool isAttachedToDevice() const; template void updateGroups(OptimizerType *optimizer, const std::vector &optimizers); std::string getPopartIR() const; std::set getTensorNames() const; // Returns the PopART type for specified id PopartType getPopartType(TensorId id) const; // Caches all PopART types void cachePopartTypes(); // Returns cached PopART type for the specified id // Caution: no bounds checking as this is called for each input, each run. // cachePopartType must be called once first. 
PopartType getCachedPopartType(TensorId id) const { return ids_types[id]; } void setAttribute(const std::string &attribute, const std::string &key, const std::string &value); void clearAttribute(const std::string &attribute, const std::string &key); popart::DebugContext getDebugContext(const std::string &name); // Mark named buffer as updatable void registerUpdatableNamedBuffer(const TensorId &id, const popart::TensorInfo &info); private: // Raise an error if cycle logging is enabled void errorOnCycleLogging() const; // Keep all the PopART tensors in a cache to avoid adding duplicate constants, // wasting tile memory. This must also be mapped by builder as constants // exist only in one graph. std::map> _constants_cache; std::vector> _constant_cloned_data; // Constants which are simply returned (possibly as part of a tuple/list) and // do not need to be input into Popart std::unordered_map _host_side_constants; std::shared_ptr _device; std::unordered_map> _attribute_key_value_map; }; } // namespace detail popart::DataType popartTypeFromPoptorch(PopartType); poplar::Type poplarTypeFromPoptorch(PopartType); } // namespace popart_compiler } // namespace poptorch ================================================ FILE: popart_compiler/source/include/popart_compiler/CompilerOptions.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#pragma once

// NOTE(review): the targets of the system includes and the template argument
// lists in this header are missing from the dump; tokens are kept as found.
#include
#include
#include
#include
#include
#include
#include

#include "popart_compiler/PopartEnums.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// `N` is the trailing enumerator after the real execution modes.
enum class ExecutionMode { Pipelined, Sharded, Phased, N };

// To be kept in sync with the Liveness python enum in python/enums.py
enum class Liveness {
  AlwaysLive,
  OffChipAfterFwd,
  OffChipAfterFwdNoOverlap,
  OffChipAfterEachPhase,
  N
};

// PopTorch-specific compilation options (as opposed to the PopART session
// options). Members declared without an initialiser have no default here.
struct CompilerOptions {
  // A constant to tell the compiler to use the system ipu version
  constexpr static std::uint64_t use_system_ipu_version =
      std::numeric_limits::max();

  // Make PopART save the initializers in a separate file.
  // (Needed to keep the ONNX protobuf below the 2GB limit when compiling
  // large models)
  std::string external_initializers_file;

  // Number of times the graph will be executed for each execution.
  std::uint64_t steps{0};
  // Strategy to adopt for returning the graph's output tensors.
  PopartOutputMode output_mode;
  // 'N' when output_mode == PopartOutputMode::EveryN
  std::uint64_t output_return_period;
  // True if running on the model, False otherwise.
  bool ipu_model{false};
  // Automatically round up the number of IPUs, if required, to the minimum
  // number required to be reserved
  bool auto_round_num_ipus{false};
  // Only used for offline compilation (DeviceConnectionType.Never): version
  // of the IPU should the Poplar compiler be targeting.
  std::uint64_t ipu_version{use_system_ipu_version};
  // ID of the specific IPU the user wants to use. (If not set we'll just
  // iterate over the IPUs present on the system and try to connect to one
  // that matches our requirements).
  std::uint64_t ipu_id{0};
  popart::DeviceConnectionType connection_type;
  popart::SyncPattern sync_pattern;
  std::uint64_t random_seed{0};

  // The frontend will unpack the user option and pass it directly in as
  // [IPU_ID] = Memory proportion for that IPU
  std::unordered_map available_memory_proportion;

  // When running in distributed mode: number of processes the training is
  // split over.
  std::uint64_t num_distributed_processes{1};
  // In distributed mode: unique ID of this process in
  // [0, num_distributed_processes] range
  std::uint64_t distributed_process_id{0};

  popart::Patterns patterns{popart::PatternsLevel::Default};
  ExecutionMode execution_mode{};

  // Phased execution options: see the python documentation for more
  // information about how to use them
  //
  // Here is how they translate into Popart options:
  // serial_phases_execution: True -> executionPhaseSettings.stages = 1
  //                          False-> executionPhaseSettings.stages = 2
  //
  // separate_backward_phase:
  //  False:
  //   fwd:       bwd:
  //   phase 0 -> phase 4
  //   phase 1 -> phase 3
  //   phase 2 -> phase 2
  //
  // (End of fwd and start of bwd are part of the same phase)
  //  True:
  //   fwd:       bwd:
  //   phase 0 -> phase 6
  //   phase 1 -> phase 5
  //   phase 2 -> phase 4
  //
  //  This is done by setting options.executionPhaseSettings.phases to N+1
  //
  //  Note that the bwd phases begin with phase 4 and not phase 3. This is
  //  because PopART requires the phase IDs of a fwd/bwd pair to have matching
  //  parity. Since the fwd phase ID is 2, the next phase ID with even parity
  //  is 4.
  //
  //  Furthermore, all odd phases must run on the same IPUs, and all even
  //  phases must also run on the same IPUs.
  //
  // tensors_liveness:
  //  Note: tensors have a liveness of [phase, phase+2]
  //  AlwaysLive:
  //   fwd:       bwd:
  //   phase 0 -> phase 6
  //   phase 1 -> phase 5
  //   phase 2 -> phase 4
  //   Stride = 1
  //
  //  OffChipAfterFwd:
  //   fwd:       bwd:
  //   phase 0 -> phase 8
  //   phase 1 -> phase 7
  //   phase 2 -> phase 6
  //   Stride = 1
  //   (Gap between fwd and bwd > 2)
  //  This is done by incrementing options.executionPhaseSettings.phases by 3
  //
  //  OffChipAfterFwdNoOverlap:
  //   fwd:       bwd:
  //   phase 0 -> phase 12
  //   phase 2 -> phase 10
  //   phase 4 -> phase 8
  //   Stride = 2
  //   (Gap between fwd and bwd > 2, with no overlapping of load/store)
  //  This is done by incrementing options.executionPhaseSettings.phases by 3
  //  and multiplying the phase_id by 2.
  //
  //  OffChipAfterEachPhase: (Only for stage=1)
  //   fwd:       bwd:
  //   phase 0 -> phase 20
  //   phase 4 -> phase 16
  //   phase 8 -> phase 12
  //   Stride = 4
  //   (Gap between each phase > 2)
  //  This is done by incrementing options.executionPhaseSettings.phases by 3
  //  and multiplying the phase_id by 4.
  bool serial_phases_execution{false};
  bool separate_backward_phase{false};
  Liveness tensors_liveness{};

  // Debug name for the model
  std::string model_name;

  // (Not yet supported) Whether each buffer should be broadcasted from the
  // first to other replicas on each training step.
  bool broadcast_buffers{false};

  // Replica grouping used for inputs.
  // NOTE(review): presumably these are forwarded to StepIO::setInputGroupings
  // - confirm against the compiler implementation.
  std::int64_t input_group_size;
  popart::CommGroupType input_cgt;
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/source/include/popart_compiler/CustomOps.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
/*
 * Host op represents an operation executed on the CPU. It is offloaded by
 * writing the tensors from IPU into host buffers. Triggering the operation.
 * Then writing back to IPU tensors.
 */
// NOTE(review): the targets of the two includes below are missing from the
// dump; tokens are kept as found.
#include
#include

extern "C" {
namespace poptorch {
namespace poptorch_custom_ops {

// Operator-set version used for the identifiers below that do not spell it
// out as a literal `1`.
constexpr std::uint32_t domain = 1;

// The number of input tensors we can consume (between MIN_INPUTS and
// MAX_INPUTS).
constexpr std::uint32_t min_inputs = 0;
constexpr std::uint32_t max_inputs = 64;

// Declared here only; the definition is not in this header.
extern const char host_op_metadata_attr[];

// Operator identifiers of the PopTorch custom ops. All of them live in the
// "poptorch.custom_ops" domain.
const popart::OperatorIdentifier host_op = {"poptorch.custom_ops",
                                            "HostOp",
                                            domain,
                                            {min_inputs, max_inputs}}; // NOLINT

const popart::OperatorIdentifier upsample_bilinear2d = {
    "poptorch.custom_ops", "UpsampleBilinear2d", 1};

const popart::OperatorIdentifier upsample_bilinear2d_grad = {
    "poptorch.custom_ops", "UpsampleBilinear2dGrad", 1};

const popart::OperatorIdentifier torch_softplus = {
    "poptorch.custom_ops", "TorchSoftplus", 1, {1}, 1};

const popart::OperatorIdentifier torch_softplus_inplace = {
    "poptorch.custom_ops", "TorchSoftplusInplace", 1, {1}, 1};

const popart::OperatorIdentifier torch_softplus_grad = {
    "poptorch.custom_ops", "TorchSoftplusGrad", 1, {1}, 1};

const popart::OperatorIdentifier embedding = {"poptorch.custom_ops",
                                              "Embedding", domain};

const popart::OperatorIdentifier embedding_grad = {"poptorch.custom_ops",
                                                   "EmbeddingGrad", domain};

const popart::OperatorIdentifier fast_gather_last_dim = {
    "poptorch.custom_ops", "FastGatherLastDim", 1};

const popart::OperatorIdentifier fast_gather_last_dim_grad = {
    "poptorch.custom_ops", "FastGatherLastDimGrad", 1};

} // namespace poptorch_custom_ops
} // namespace poptorch
}

================================================
FILE: popart_compiler/source/include/popart_compiler/MultiConvBuilder.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

// NOTE(review): the targets of the system includes and the template argument
// lists in this header are missing from the dump; tokens are kept as found.
#include
#include
#include
#include
#include
#include

#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// Accumulates the inputs and attributes of several convolutions, plus the
// multiconv options, and emits them as one PopART multiconv op in build().
class MultiConvBuilder {
public:
  void addConv(const std::vector &inputs, const std::vector &dilations,
               const std::vector &kernel_shape, const std::vector &pads,
               const std::vector &strides) {
    // Record the inputs and attributes for this single conv
    _inputs.push_back(inputs);
    _dilations.push_back(dilations);
    _kernel_shape.push_back(kernel_shape);
    _pads.push_back(pads);
    _strides.push_back(strides);
  }

  void setAvailableMemoryProportions(const std::vector &v) {
    _options.availableMemoryProportions = v;
  }

  // Translates the integer codes into the option strings:
  // 0 -> "float", 1 -> "half". Any other value raises an error.
  void setPartialsTypes(const std::vector &partials_types) {
    std::vector type_strs;

    for (int64_t t : partials_types) {
      if (t == 0) {
        type_strs.emplace_back("float");
      } else if (t == 1) {
        type_strs.emplace_back("half");
      } else {
        ERROR("Invalid MultiConv partials_types");
      }
    }

    _options.partialsTypes = type_strs;
  }

  void setEnableConvDithering(const std::vector &v) {
    _options.enableConvDithering = v;
  }

  // Translates the integer code into the option string:
  // 0 -> "parallel", 1 -> "serial". Any other value raises an error.
  void setPlanType(int64_t plan_type) {
    if (plan_type == 0) {
      _options.planType = "parallel";
    } else if (plan_type == 1) {
      _options.planType = "serial";
    } else {
      ERROR("Invalid MultiConv plan_type");
    }
  }

  void setPerConvReservedTiles(int n) { _options.perConvReservedTiles = n; }

  void setCycleBackOff(float v) { _options.cycleBackOff = v; }

  // Emits the multiconv op on `builder` from the recorded convolutions.
  // The third and fifth arguments are passed as empty lists, and
  // _kernel_shape is recorded by addConv() but not forwarded here.
  std::vector build(popart::Builder *builder) const {
    auto opset = builder->aiGraphcoreOpset1();
    return opset.multiconv(_inputs, _dilations, {}, _pads, {}, _strides,
                           _options.availableMemoryProportions,
                           _options.partialsTypes, _options.planType,
                           _options.perConvReservedTiles,
                           _options.cycleBackOff, _options.enableConvDithering);
  }

private:
  // Aggregated inputs for all the convs that are fused as a multiconv
  std::vector> _inputs;
  std::vector> _dilations;
  std::vector> _kernel_shape;
  std::vector> _pads;
  std::vector> _strides;
  popart::MultiConvOptions _options = {{}, {}};
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/source/include/popart_compiler/SessionOptionsImpl.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#pragma once

#include
#include
#include
#include
#include

#include "popart/sessionoptions.hpp"
#include "popart_compiler/CompilerOptions.hpp"

namespace poptorch {
namespace popart_compiler {
namespace detail {

// Holds the PopART and PopTorch option structures together with per-type
// maps from option name to the setter callable invoked by set().
struct SessionOptionsImpl {
  SessionOptionsImpl();

  std::map> bool_options;
  std::map> uint64_options;
  std::map> string_options;
  std::map> double_options;

  std::map)>>
      container_options;
  // Names of the options that went through set().
  std::set options_set;

  popart::SessionOptions popart_options;
  CompilerOptions poptorch_options;

  void setMemoryProportion(std::uint32_t ipu, float memory) {
    poptorch_options.available_memory_proportion[ipu] = memory;
  }

  // Looks up the setter registered for `key` in `options`, calls it with
  // `value` and records `key` in options_set. Raises an error mentioning
  // `typeStr` if `key` is not a known option.
  template
  void set(const std::string &key, ValueType value,
           std::map> &options,
           const std::string &typeStr) {
    const auto it = options.find(key);
    ERROR_ON_MSG(it == options.end(),
                 "Unknown " << typeStr << " option " << key);
    it->second(value);
    options_set.insert(key);
  }
};

} // namespace detail
} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/types/include/popart_compiler/CompilerTypes.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#pragma once

#include
#include
#include

#include "popart_compiler/PopartEnums.hpp"

// This header should contain ABI agnostic data types which are
// used to share data with other PopTorch components.
// Types in this file must not depend on external symbols.
namespace poptorch {
namespace popart_compiler {

// PopTorch abstraction of popart::MutableVoidData to be used across the ABI
// boundary
// NOTE(review): template argument lists (e.g. of `std::vector shape`) are
// missing from the dump; tokens are kept as found.
struct TensorMetadata {
  const char *id;
  std::vector shape;
  const char *dtype;
  void *data = nullptr;
  int64_t num_bytes = -1;
};

/*
  We use this callback structure to capture data from the poptorch python
  frontend. We get the function to call as well as pointers to the output/input
  storage waiting on CPU. From this we derive more data, see
  CallbackInternalMetadata in CompilerImpl.hpp.
*/
struct CallbackMetadata {
  // The thing we are calling back.
  std::function the_callback;

  // Due to tracing complexities we have to register the buffers as a separate
  // step after the model has been traced.
  std::function buffer_registration_callback;

  // Pointers to the buffers we created on host.
  std::vector input_pointers;
  std::vector output_pointers;
};

using TensorId = std::size_t;

// Id 0 is reserved for the "None" tensor.
static constexpr TensorId NoneTensor = 0; // NOLINT

enum class OutputElemType { Tensor, Tuple, List };

// For testing only: throw an exception of the selected type.
enum class TestErrorType {
  Poptorch,
  Popart,
  PopartInternal,
  Poplibs,
  PoplarUnrecoverable,
  PoplarUnknown,
  PoplarRecoverableFullReset,
  PoplarLinkError
};

struct OutputTypeShape {
  OutputElemType type;
  int64_t num_elements{0};
};

// One timestamp series per IO event kind (input, input complete, output,
// output complete).
struct Timestamps {
  std::vector> input;
  std::vector> input_complete;
  std::vector> output;
  std::vector> output_complete;
};

// ABI-agnostic description of an optimizer and its parameters.
struct Optimizer {
  // A named scalar parameter; the name is stored in a fixed 32-byte buffer.
  struct Parameter {
    char name[32];
    float value;
    bool is_const;
  };
  using ParamType = std::pair;

  // NOTE(review): this constructor leaves the *_is_half flags and
  // max_grad_norm without a value, and the next one leaves the *_is_half
  // flags without a value. Presumably readers check accum_types_provided
  // before using the flags - confirm that max_grad_norm is never read after
  // the first constructor.
  explicit Optimizer(OptimizerType t, bool useTfVariant)
      : type(t), accum_types_provided(false), use_tf_variant(useTfVariant) {}
  explicit Optimizer(OptimizerType t, bool useTfVariant, float maxGradNorm)
      : type(t), accum_types_provided(false), use_tf_variant(useTfVariant),
        max_grad_norm(maxGradNorm) {}

  // Full constructor: also sets the three accumulator type flags and marks
  // them as provided.
  Optimizer(OptimizerType t, bool accumType, bool firstOrderType,
            bool secondOrderType, bool useTfVariant, float maxGradNorm)
      : type(t), accum_types_provided(true), accum_type_is_half(accumType),
        first_order_momentum_accum_type_is_half(firstOrderType),
        second_order_momentum_accum_type_is_half(secondOrderType),
        use_tf_variant(useTfVariant), max_grad_norm(maxGradNorm) {}

  OptimizerType type;
  // True if the main, first and second order accum types have been set.
  bool accum_types_provided;

  // Special parameters for adam/lamb. If true accumulations will be half
  // otherwise will be float.
  bool accum_type_is_half;
  bool first_order_momentum_accum_type_is_half;
  bool second_order_momentum_accum_type_is_half;
  bool use_tf_variant;
  float max_grad_norm;

  std::vector parameters;
};

} // namespace popart_compiler
} // namespace poptorch

================================================
FILE: popart_compiler/types/include/popart_compiler/PopartEnums.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#ifndef POPART_COMPILER_POPART_ENUMS_HPP
#define POPART_COMPILER_POPART_ENUMS_HPP

#include
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace popart_compiler {

/*
 * We maintain an ABI boundary in between PopART and Torch JIT. This avoids the
 * issue of torch being compiled with different CXX_ABI versions. However it
 * means we must replicate PopART enums here so they can be shared by both.
 */

// The training optimizer algorithm used.
enum class OptimizerType : std::uint8_t {
  SGD1 = 0,
  SGD2,
  ADAM,
  ADAMW,
  ADAMW_NO_BIAS,
  RMSPROP,
  RMSPROP_CENTERED,
  LAMB,
  LAMB_NO_BIAS,
  NONE
};

// X-macro lists: each one applies the macro passed as `_` to every type name
// of the list. They are used below to generate both the PopartType enum and
// its string conversion from a single list of names.
#define FOR_ALL_FIXED_POINT_TYPES(_)                                           \
  _(UINT8)                                                                     \
  _(INT8)                                                                      \
  _(UINT16)                                                                    \
  _(INT16)                                                                     \
  _(INT32)                                                                     \
  _(INT64)                                                                     \
  _(UINT32)                                                                    \
  _(UINT64)                                                                    \
  _(BOOL)

#define FOR_ALL_FLOATING_POINT_TYPES(_)                                        \
  _(FLOAT)                                                                     \
  _(FLOAT16)                                                                   \
  _(BFLOAT16)                                                                  \
  _(FLOAT8_143)                                                                \
  _(FLOAT8_152)                                                                \
  _(DOUBLE)                                                                    \
  _(COMPLEX64)                                                                 \
  _(COMPLEX128)

#define FOR_ALL_POPART_TYPES(_)                                                \
  FOR_ALL_FIXED_POINT_TYPES(_)                                                 \
  FOR_ALL_FLOATING_POINT_TYPES(_)                                              \
  _(STRING)                                                                    \
  _(UNDEFINED)

// The types supported by popart.
#define DEFINE_ENUM(value) value,
enum class PopartType { FOR_ALL_POPART_TYPES(DEFINE_ENUM) };
#undef DEFINE_ENUM

// Expands to one `case` per type which returns the enumerator's name.
#define DEFINE_CASE(value)                                                     \
  case PopartType::value: {                                                    \
    return #value;                                                             \
  }

// Return the name of the given PopartType enumerator as a string.
// Any value not listed in FOR_ALL_POPART_TYPES is reported through ERROR().
inline std::string toPopartTypeStr(const PopartType &type) {
  switch (type) {
    FOR_ALL_POPART_TYPES(DEFINE_CASE)
  default:
    ERROR("Unsupported PopartType");
  }
}

#undef DEFINE_CASE

// See AnchorReturnTypeId in popart/dataflow.hpp for a full description of each.
// Must be kept in sync with OutputMode in python/enums.py
enum class PopartOutputMode : std::uint8_t { Final = 0, EveryN, All, Sum, N };

// Must be static so each library gets its own copy, __attribute__((unused)) is
// to silence the warning if it is unused in any of them.
static PopartOutputMode outputModeFromString(const std::string &str)
    __attribute__((unused));
static const char *outputModeToString(PopartOutputMode type)
    __attribute__((unused));

// Convert an upper case mode name ("FINAL", "EVERYN", "ALL" or "SUM") to the
// matching PopartOutputMode. The comparison is exact (case sensitive); any
// other string is reported through ERROR().
static PopartOutputMode outputModeFromString(const std::string &str) {
  if (str == "FINAL") {
    return PopartOutputMode::Final;
  }
  if (str == "EVERYN") {
    return PopartOutputMode::EveryN;
  }
  if (str == "ALL") {
    return PopartOutputMode::All;
  }
  if (str == "SUM") {
    return PopartOutputMode::Sum;
  }
  ERROR("Internal error: unsupported output mode :" << str);
}

// Popart only supports a string interface for them so we have to convert back.
//
// PopartOutputMode::N (and any other value) is reported through ERROR().
//
// NOTE(review): Sum is returned as "Sum" (mixed case) whereas the three other
// modes are upper case and outputModeFromString() only accepts "SUM", so the
// two functions do not round-trip for Sum. Confirm which spellings PopART
// accepts before changing either side.
static const char *outputModeToString(PopartOutputMode type) {
  switch (type) {
  case PopartOutputMode::Final:
    return "FINAL";
  case PopartOutputMode::EveryN:
    return "EVERYN";
  case PopartOutputMode::All:
    return "ALL";
  case PopartOutputMode::Sum:
    return "Sum";
  default:
    ERROR("UNREACHABLE: Converting output mode to string");
  }
}

} // namespace popart_compiler
} // namespace poptorch

#endif // POPART_COMPILER_POPART_ENUMS_HPP

================================================
FILE: poptorch/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch)

add_library(poptorch_internal_headers INTERFACE)
target_include_directories(poptorch_internal_headers INTERFACE include source/include)

# Ensure ABI matches that of PyTorch
add_definitions(${TORCH_CXX_FLAGS})

add_library(poptorch SHARED
  "source/AddDetachOperations.cpp"
  "source/AddSubgraphConnectionNodes.cpp"
  "source/AliasProcessing.cpp"
  "source/CPUOffloadingCleanUp.cpp"
  "source/ErrorOnUnsupportedAten.cpp"
  "source/FixupSetAvailableMemory.cpp"
  "source/GatherWithExpandedIndicesOptimization.cpp"
  "source/ImplicitCasting.cpp"
  "source/InplaceOps.cpp"
  "source/LowerToPopart.cpp"
  "source/LowerToPopartFactories.cpp"
  "source/OpBuilder.cpp"
  "source/OverlappedIO.cpp"
  "source/PopartCanonicalization.cpp"
  "source/PopartLateCanonicalization.cpp"
  "source/PoplarExecutable.cpp"
"source/PoptorchSymbols.cpp" "source/RemoveSurplusIdentityLosses.cpp" "source/RequiresGrad.cpp" "source/GNNOptimizations.cpp" "source/SessionOptionsParser.cpp" "source/Utils.cpp" "source/popart_canonicalization/ActivationOps.cpp" "source/popart_canonicalization/ArithmeticOps.cpp" "source/popart_canonicalization/AtenHandlers.gen.cpp" "source/popart_canonicalization/BilinearOps.cpp" "source/popart_canonicalization/BitwiseOps.cpp" "source/popart_canonicalization/BlasOps.cpp" "source/popart_canonicalization/ConstantOps.cpp" "source/popart_canonicalization/ConvolutionOps.cpp" "source/popart_canonicalization/CustomOps.cpp" "source/popart_canonicalization/DistanceOps.cpp" "source/popart_canonicalization/DropoutOps.cpp" "source/popart_canonicalization/EinsumOp.cpp" "source/popart_canonicalization/EmbeddingOps.cpp" "source/popart_canonicalization/IndexOps.cpp" "source/popart_canonicalization/LossOps.cpp" "source/popart_canonicalization/NormalizationOps.cpp" "source/popart_canonicalization/OtherOps.cpp" "source/popart_canonicalization/PoolingOps.cpp" "source/popart_canonicalization/PopartCanonicalizationUtils.cpp" "source/popart_canonicalization/PoptorchHandlers.gen.cpp" "source/popart_canonicalization/PyGTorchScatterOps.cpp" "source/popart_canonicalization/PyGTorchSplineConvOps.cpp" "source/popart_canonicalization/RNNOps.cpp" "source/popart_canonicalization/RandomSamplingOps.cpp" "source/popart_canonicalization/ReduceOps.cpp" "source/popart_canonicalization/ReshapeOps.cpp" "source/popart_canonicalization/SliceOps.cpp" "source/popart_canonicalization/SoftmaxOps.cpp" "source/popart_canonicalization/ScatterReduction.cpp" "source/popart_canonicalization/TensorOps.cpp" "source/popart_canonicalization/pyg_torch_cluster/FpsOp.cpp" "source/popart_canonicalization/pyg_torch_cluster/GridOp.cpp" "source/popart_canonicalization/pyg_torch_cluster/NearestOp.cpp" "source/type_and_constant_canonicalization/AddListNumElements.cpp" 
"source/type_and_constant_canonicalization/CanonicaliseConstants.cpp" "source/type_and_constant_canonicalization/CastUnsupportedInputs.cpp" "source/type_and_constant_canonicalization/CheckAndChangeOutputTypes.cpp" "source/type_and_constant_canonicalization/EvaluateConstexprs.cpp" "source/type_and_constant_canonicalization/MakeConstantIntParams.cpp" ) file(GLOB_RECURSE poptorch_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*") set_target_properties(poptorch PROPERTIES CXX_STANDARD 17 PUBLIC_HEADER "${poptorch_public_headers}") target_link_libraries(poptorch PUBLIC popart_compiler_types PRIVATE dispatch_tracer popart_compiler poptorch_logging torch stdc++fs ) target_include_directories(poptorch PUBLIC $ $ PRIVATE source/include) install(TARGETS poptorch LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/poptorch ) ================================================ FILE: poptorch/include/poptorch/DispatchTracer.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_ #define INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_ #include #include #include #include #include namespace at { class Tensor; } namespace torch { namespace jit { struct Graph; struct Node; struct Value; } // namespace jit } // namespace torch namespace poptorch { struct CompilerOptions; struct InplaceGraphInfo; struct PoptorchErrorInfo; // Toggled by the user in python to choose which backend to target when tracing. // CPU and SENTINEL will only be toggled by us. enum TracingMode { // Compile normal JIT to run via PopART POPART, }; struct PerReplicaSettings { int comm_group_type; int shards; int variable_retrieval_mode; int64_t size0; std::shared_ptr> host_buffer; }; // Create a new graph. void createGraph(TracingMode mode, const std::vector &inputs, const CompilerOptions &options); // The current graph is complete: finalize it. 
// // Trying to add ops after this call is undefined behaviour. void finalizeGraph(); InplaceGraphInfo getInplaceGraphInfo(size_t num_anchors, bool replicas_needing_broadcast); // Get the captured JIT graph. In reality is just returning the // torch::jit::Graph it's already been compiling during the dispatch process. std::shared_ptr getTracedGraph(); // Get a pointer to the data source for an IPU input / parameter tensor. // If the value is not a parameter or an input, return nullptr. void *getDataSource(const at::Tensor &tensor); void setParameterName(const at::Tensor &tensor, const std::string &name); // Return the name of a parameter or an empty string if no name was set. std::string getParameterName(torch::jit::Value *value); void setParameterPerReplica(const std::string ¶m_name, const at::Tensor &tensor, int comm_group_type, int shards, int variable_retrieval_mode); bool getParameterPerReplica(torch::jit::Value *value, PerReplicaSettings &settings); // Get a pointer to the data source for a given JIT value. // The value must be an IPU value. // If the value is not a parameter or an input, return nullptr. void *getDataSourceForValue(torch::jit::Value *value); // Return true if the given IPU tensor is a parameter. bool isParameter(torch::jit::Value *value); // Start capturing calls. void startDispatch(); // Stop capturing calls. void endDispatch(bool error_occurred = false); // Called before starting to move parameters between the CPU and the IPU. // (This is used to differentiate inputs from parameters / buffers) // We expect something like: // >>> poptorch_core.startParametersMove() // >>> my_model.to("ipu") // >>> poptorch_core.endParametersMove() void startParametersMove(); void endParametersMove(); // Called before starting to move outputs from the IPU to the CPU. // Allows us to error if an attempt is made to move outputs outside // of IPUScope.outputs(). 
void startOutputsMove(); void endOutputsMove(); // Return true if we should be compiling with the dispatcher. bool isCompilingWithDispatcher(); // Cleans up all objects associated with poptorch void poptorchAtExit(); // Destroy the active dispatcher object. void destroyDispatcher(); void replaceValueDispatcher(torch::jit::Value *v_old, torch::jit::Value *v_new); std::uint64_t getIpuTensorId(const at::Tensor &tensor); using PoptorchErrorThrower = std::function; // Set the function to use to throw python PoptorchError exceptions. void setPoptorchErrorThrower(PoptorchErrorThrower thrower); // Throw an exception using the poptorch error thrower. // Note: used by RegisterAtenOverloads.cpp in a template, that's why it needs // to be declared publicly. void throwPoptorchError(const PoptorchErrorInfo &info); } // namespace poptorch #endif // INCLUDE_POPTORCH_DISPATCH_TRACER_HPP_ ================================================ FILE: poptorch/include/poptorch/InplaceOps.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_INPLACE_OPS_H #define INCLUDE_POPTORCH_INPLACE_OPS_H #include #include #include #include #include namespace c10 { struct Symbol; } // namespace c10 namespace torch { namespace jit { struct Graph; struct Node; using NodeKind = c10::Symbol; struct Value; } // namespace jit } // namespace torch namespace poptorch { // Store information related to Graph inputs modified in place. struct InplaceGraphInfo { // Mapping for a graph input which is not modified in place. static constexpr size_t no_mapping = std::numeric_limits::max(); // Number of outputs from the graph which are not used to emulate // inplace ops. (An output may be a list or tuple as well as a tensor). size_t num_normal_outputs{0}; // Number of tensors output from the graph which are not used to // emulate inplace ops. (This differs from the previous if the graph returns // one or more tuples/lists.) 
size_t num_tensor_outputs{0}; // Mapping between each input tensor and the output tensor used // to update the input. If the input tensor is not changed in place, it will // be equal to InplaceGraphInfo::no_mapping // // Note: these are all Graph inputs (inputs and parameters) but only inputs // can have a mapping. // // If the input at graph_input_idx is modified in place: // m[graph_input_idx] = graph_output_idx // else // m[graph_input_idx] = no_mapping std::vector input_output_mapping{}; }; // Get the NodeKind corresponding to the outplace version of the given // inplace op NodeKind torch::jit::NodeKind outplaceKind(torch::jit::NodeKind kind); class InplaceInputsTracker { public: void addTensor(torch::jit::Value *input); // Find if the given value is an alias for an input, if so remove the alias // from the tracker and return the input it was aliasing. If the given value // doesn't alias an input return nullptr. torch::jit::Value *eraseCurrentAlias(torch::jit::Value *alias); void registerAlias(torch::jit::Value *aliased_input, torch::jit::Value *alias); InplaceGraphInfo finalizeGraph(torch::jit::Graph &graph, size_t num_anchors, bool replicas_needing_broadcast); private: // alias -> aliased std::unordered_map _aliases; }; void fixForLoopInputs(torch::jit::Graph &graph); void verifyIfElseBlocksOrder(const torch::jit::Graph &graph); } // namespace poptorch #endif // INCLUDE_POPTORCH_INPLACE_OPS_H ================================================ FILE: poptorch/include/poptorch/LowerToPopart.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_LOWER_TO_POPART_H #define INCLUDE_POPTORCH_LOWER_TO_POPART_H #include #include #include #include #include #include #include "popart_compiler/Compiler.hpp" #include "popart_compiler/PopartEnums.hpp" #include "poptorch/PoplarExecutable.hpp" #include "poptorch/SessionOptionsParser.hpp" namespace poptorch { namespace popart_compiler { class SessionOptions; } namespace detail { class LowerToPopartImpl; } // namespace detail // CallbackMetadata is used to pass information from python to the poplar custom // op for CPU ops. The string is the ID given by the user to each op. using CPUCallbackMap = std::unordered_map; struct Anchor { Anchor(std::string n, std::uint8_t m, size_t p) : name(std::move(n)), mode(m), period(p) {} std::string name; std::uint8_t mode; size_t period; }; using AnchorList = std::vector; /* * Take the transformed graph and create a poponnx graph from it. */ struct InplaceGraphInfo; class LowerToPopart { public: LowerToPopart(torch::jit::Graph *graph, InplaceGraphInfo &&inplace_info, bool training, std::vector &&opt, const popart_compiler::SessionOptions &options, const AttributeAccessor &attribute_accessor, CPUCallbackMap callback, AnchorList &&anchors); LowerToPopart(LowerToPopart &&lower); ~LowerToPopart(); void lower(); std::shared_ptr compile(); std::shared_ptr loadExecutableFromFile(const std::string &input_filename); private: std::unique_ptr _impl; }; } // namespace poptorch #endif // INCLUDE_POPTORCH_LOWER_TO_POPART_H ================================================ FILE: poptorch/include/poptorch/LowerToPopartFactories.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H #define INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H #include #include #include #include #include "poptorch/LowerToPopart.hpp" #include "poptorch/SessionOptionsParser.hpp" namespace poptorch { poptorch::LowerToPopart lowerToPopartFromDispatch( SessionOptionsParser &parser, bool training, AnchorList &&anchors_list, const std::function &initCallbackBuffers, std::vector &&optimizers, const AttributeAccessor &attribute_accessor, CPUCallbackMap &callbacks); } // namespace poptorch #endif // INCLUDE_POPTORCH_LOWER_TO_POPART_FACTORIES_H ================================================ FILE: poptorch/include/poptorch/PoplarExecutable.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP #define INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP #include #include #include #include #include #include #include "popart_compiler/Compiler.hpp" #include "poptorch/InplaceOps.hpp" namespace poptorch { class PoplarExecutable { public: PoplarExecutable() = delete; PoplarExecutable(popart_compiler::Compiler &&c, std::vector &&inputs, std::vector &&outputs, std::vector &&outputTypes, std::vector parameter_names, InplaceGraphInfo &&inplace_info) : _compiler(std::move(c)), _popart_inputs(inputs), _popart_outputs(outputs), _popart_output_types(outputTypes), _parameter_names(std::move(parameter_names)), _inplace_info(std::move(inplace_info)) { for (size_t i = 0; i < inputs.size(); i++) { _converted_inputs.emplace_back(); } } void loadEngineAndConnectStreams(); /* * Execute the compiled graph stored in field "compiler" with the given * |inTensors| and return to the user the resulting tensors if any. */ std::vector run(std::vector &inTensors); void updateOptimizers(const std::vector &optimizer); // Tell popart to copy weights off the IPU and write into host memory. 
void copyWeightsToHost(const std::map &buffers); // Tell popart to copy weights from host into IPU memory. void copyWeightsToDevice(const std::map &buffers); // Tell popart to copy named buffers from host into IPU memory. void copyNamedBuffersToDevice(const std::map &buffers); const std::vector &outputTypes() const; // Get the IR from popart. std::string getPopartIR() const; // Get the tensor names that occur in the model graphs. std::set getTensorNames() const; void detachFromDevice(); void attachToDevice(); bool isAttachedToDevice() const; const popart_compiler::Compiler &getCompiler() const { return _compiler; } popart_compiler::Compiler &getCompiler() { return _compiler; } private: popart_compiler::Compiler _compiler; std::vector _popart_inputs; // Used for types which need conversion to maintain the ref count std::vector _converted_inputs; std::vector _popart_outputs; std::vector _popart_output_types; const std::vector _parameter_names; const InplaceGraphInfo _inplace_info; }; } // namespace poptorch #endif // INCLUDE_POPTORCH_POPLAR_EXECUTABLE_HPP ================================================ FILE: poptorch/include/poptorch/SessionOptionsParser.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP #define INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP #include #include #include #include #include #include #include #include #include "popart_compiler/CompilerTypes.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace popart_compiler { class SessionOptions; } // namespace popart_compiler // Interface to parse a python object without adding a dependency on pybind class IPyValue { public: virtual std::function toFunction() const = 0; virtual bool isBoolean() const = 0; virtual bool toBoolean() const = 0; virtual bool isDouble() const = 0; virtual double toDouble() const = 0; virtual bool isInt() const = 0; virtual std::int64_t toInt64() const = 0; virtual std::uint64_t toUInt64() const = 0; virtual bool isString() const = 0; virtual std::string toString() const = 0; virtual bool isSetListOrTuple() const = 0; virtual void forEachInList(std::function) const = 0; virtual bool isDict() const = 0; virtual void forEachInDict( std::function) const = 0; // Return nullptr if the key doesn't exist virtual std::unique_ptr getFromDict(const std::string &key) const = 0; // Return nullptr if index is out of bounds virtual std::unique_ptr getFromList(std::uint64_t index) const = 0; virtual std::uint64_t getListSize() const = 0; virtual std::string type() const = 0; float toFloatWithRangeCheck() const; std::vector toVectorString() const; virtual ~IPyValue() = default; }; class SessionOptionsParser { public: explicit SessionOptionsParser(const IPyValue &opts); popart_compiler::SessionOptions &options(); ~SessionOptionsParser(); private: std::unique_ptr _opts; }; typedef std::function(const std::string &)> AttributeAccessor; } // namespace poptorch #endif // INCLUDE_POPTORCH_SESSION_OPTIONS_PARSER_HPP ================================================ FILE: poptorch/include/poptorch/Utils.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_UTILS_HPP #define INCLUDE_POPTORCH_UTILS_HPP #include #include #include #include #include #include namespace poptorch { torch::jit::Node *findEarliestUser(const torch::jit::Value *value); bool isNondeterministic(const torch::jit::Node &node); std::string nodeToString(const torch::jit::Node *node); std::string scalarTypeToOnnxString(at::ScalarType type); at::ScalarType onnxStrToScalarType(const char *type_str); at::ScalarType coerceToSupportedType(at::ScalarType type); torch::jit::Node *createAndInsertCastOp(torch::jit::Graph *graph, torch::jit::Value *val, at::ScalarType type); // Returns a collapsed version of the graph input hierachy into a list of // tensor values by following any tuples/lists and their unpacking // N.B. if a tuple is not used (unpacked), the resulting values will be null // as a placeholder. std::vector collapsedGraphInputHierachy(torch::jit::Graph *graph); // Return the number of tensors for a given type: in the case of a tensor // this is 1, but in case of nested tuples, this is the sum over all. size_t numTensorsForType(const c10::TypePtr &type); // Delete a node and also its inputs if they are also unused. void searchAndPossiblyDestroy( const std::unordered_set &to_test); // Remove all the node's inputs and destroy them if they're not used // anywhere else. void removeAndPossiblyDestroyAllInputs(torch::jit::Node *node); std::unique_ptr stringToUniquePtr(const std::string &str); // Get the tensor shape as a vector of ints. std::vector shapeFromTensor(const torch::jit::Value *value); // Add casts as necessary such that weight and bias have the same scalar type // as input. void castWeightAndBias(torch::jit::Graph *graph, torch::jit::Value *input, torch::jit::Value *&weight, torch::jit::Value *&bias); // A replacement for PyTorch's ListType which includes the number of elements // unlike PyTorch's own type. 
class ListTypeWithNumElements : public c10::SingleElementType {
public:
  // Build a list type with the given element type and number of elements.
  ListTypeWithNumElements(c10::TypePtr elem_type, size_t num_elements)
      : SingleElementType(std::move(elem_type)), _num_elements(num_elements) {}

  // Two types are considered equal when `rhs` can be cast by rhs.cast() and
  // both have the same number of elements; false if the cast fails.
  // NOTE(review): only the element count is compared here, the element types
  // are not - confirm this is intentional.
  bool equals(const Type &rhs) const override {
    if (auto rhs_cast = rhs.cast()) {
      return numElements() == rhs_cast->numElements();
    }
    return false;
  }

  // Number of elements recorded at construction.
  size_t numElements() const { return _num_elements; }

  std::string str() const override;

  // Create a plain c10::ListType with the same element type (the number of
  // elements is not carried over).
  c10::ListTypePtr getOriginalListType() const {
    return c10::ListType::create(getElementType());
  }

private:
  size_t _num_elements;

  // The printer is deliberately ignored: the annotation is the same as str().
  std::string annotation_str_impl(c10::TypePrinter printer) const override {
    (void)(printer);
    return str();
  }
};
using ListTypeWithNumElementsPtr = std::shared_ptr;

// Scalar type and dimensions of a tensor, built either from an at::Tensor or
// from a JIT value.
struct JitTensorInfo {
  explicit JitTensorInfo(const at::Tensor &tensor);
  explicit JitTensorInfo(torch::jit::Value *value);

  std::string toString() const;

  at::ScalarType scalar_type;
  std::vector dims;
};

// NOTE(review): presumably checks that the JIT value and the tensor agree on
// shape and type - behaviour on mismatch is defined in Utils.cpp, confirm
// there.
void validateTensorShapeAndType(torch::jit::Value *value,
                                const at::Tensor &tensor);

// setNodeTensorAttrValue and getNodeTensorAttrValue must be used instead of
// node->t_(c10::attr::value, v) and node->t(c10::attr::value).
//
// When printing a torch::jit::Graph the graph will iterate over each node
// and print all its attributes.
// If an attribute is a tensor it will try to print the content of that
// tensor which in our case would trigger a copy from IPU to CPU. This copy
// not only will fail, it will also be interpreted as a request to add this
// tensor as a graph output which will corrupt the graph.
// However attributes which are arrays of tensors are not printed and therefore
// will not trigger a copy, so behind the scenes these functions will wrap
// and unwrap the tensor attribute in a size 1 vector.
void setNodeTensorAttrValue(torch::jit::Node *node, torch::jit::TensorAttr::ConstructorType value); const torch::jit::TensorAttr::ValueType & getNodeTensorAttrValue(const torch::jit::Node *node); } // namespace poptorch #endif // INCLUDE_POPTORCH_UTILS_HPP ================================================ FILE: poptorch/source/AddDetachOperations.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include #include "PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { std::map detached_values; std::set visited_nodes; torch::jit::Value *possiblyDetachedValue(torch::jit::Graph *graph, torch::jit::Value *value) { auto *producer = value->node(); auto producer_kind = producer->kind(); if (value->requires_grad() || producer_kind == c10::prim::Constant || producer_kind == symbols::poptorch::tensor_constant || producer_kind == symbols::poptorch::host_side_tensor_constant || producer_kind == symbols::popart::detach || producer_kind == c10::prim::TupleConstruct || producer_kind == c10::prim::ListConstruct) { return value; } auto it = detached_values.find(value); if (it == detached_values.end()) { WithNodeMetadata meta(producer); auto *detach = graph->create(symbols::popart::detach); detach->addInput(value); insertNodeAfterNode(detach, producer); detach->output(0)->setType(value->type()); it = detached_values.insert({value, detach->output(0)}).first; } return it->second; } void maybeInsertDetachOp(torch::jit::Graph *graph, torch::jit::Node *node) { logging::LogContext ctx( "AddDetachOperations (maybeInsertDetachOp) processing " + nodeToString(node)); if (visited_nodes.find(node) != visited_nodes.end()) { return; } visited_nodes.insert(node); for (torch::jit::Value *input : node->inputs()) { auto *detach = 
possiblyDetachedValue(graph, input); if (input == detach) { maybeInsertDetachOp(graph, input->node()); } } } void replaceDetachedValues(torch::jit::Node *node) { logging::LogContext ctx( "AddDetachOperations (replaceDetachedValues) processing " + nodeToString(node)); if (visited_nodes.find(node) != visited_nodes.end()) { return; } visited_nodes.insert(node); for (torch::jit::Value *input : node->inputs()) { auto it = detached_values.find(input); if (it != detached_values.end()) { if (node->kind() == symbols::popart::detach) { // Only replace values (with their detached counterparts) that exist // after the detach node that generated the detached value. return; } node->replaceInputWith(input, it->second); } replaceDetachedValues(input->node()); } } } // namespace void addDetachOperations(torch::jit::Graph *graph) { detached_values.clear(); visited_nodes.clear(); // Special prim::Param nodes that correspond to graph inputs should not be // detached so we superficially mark them as detached before processing. for (torch::jit::Value *input : graph->inputs()) { visited_nodes.insert(input->node()); } // Process the graph recursively and replace the values at the end. maybeInsertDetachOp(graph, graph->return_node()); visited_nodes.clear(); for (torch::jit::Value *input : graph->inputs()) { visited_nodes.insert(input->node()); } replaceDetachedValues(graph->return_node()); } } // namespace poptorch ================================================ FILE: poptorch/source/AddSubgraphConnectionNodes.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include #include "PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { // A small class to keep track of information regarding subgraphs. 
struct Subgraph { // All the nodes in the subgraph. std::unordered_set nodes; // Track the inputs already added so we don't double count them. std::unordered_set added_inputs; // Map of new inputs to old inputs. std::unordered_map input_map; // Map of old inputs to the new ones. std::unordered_map reverse_input_map; }; bool isTerminator(const torch::jit::Node *node) { return node->kind() == symbols::poptorch::end_for_loop; } bool isUsedInTerminator(const torch::jit::Node *node) { for (const torch::jit::Value *output : node->outputs()) { for (const torch::jit::Use &use : output->uses()) { const torch::jit::Node *user = use.user; if (isTerminator(user)) { return true; } } } return false; } bool markInputsAsComingFromParent(torch::jit::Graph *graph, torch::jit::Node *node, Subgraph *subgraph, const bool inputFromParent = true) { bool used_in_subgraph = false; WithNodeMetadata meta(node); // If this node is NOT used in the terminator then we need to add it as an // input to the graph. for (torch::jit::Value *value : node->inputs()) { // If the user isn't used in this subgraph AND the node hasn't already // been marked an input. if (subgraph->nodes.count(value->node()) == 0) { if (subgraph->added_inputs.count(value) == 0) { if (!inputFromParent) { torch::jit::Node *new_out = createAddUntypedInputTensor(graph, value); subgraph->input_map.insert({new_out->output(), value}); subgraph->reverse_input_map.insert({value, new_out->output()}); } subgraph->added_inputs.insert(value); used_in_subgraph = true; } } else { used_in_subgraph = true; } } return used_in_subgraph; } void markOutputs(torch::jit::Graph *graph, torch::jit::Node *outputs, torch::jit::Node *insertion_point, Subgraph *subgraph) { torch::jit::WithInsertPoint insert_point(outputs); // Sometimes the return might not be processed in this node. 
const bool used_in_subgraph = markInputsAsComingFromParent(graph, outputs, subgraph); for (torch::jit::Value *output : outputs->inputs()) { WithNodeMetadata meta{output->node()}; // Add an identity op in lieu if the op isn't used in the subgraph to make // sure popart handles the alias correctly. if (!used_in_subgraph) { torch::jit::Node *node = createIdentity(graph, {output}); output = node->output(); } // PopART doesn't allow inputs to be outputs directly. if (subgraph->reverse_input_map.find(output) != subgraph->reverse_input_map.end()) { output = subgraph->reverse_input_map[output]; } torch::jit::Node *new_node = createAddOutputTensor(graph, output); insertNodeBeforeNode(new_node, insertion_point); } } struct InsertionPointAndShape { torch::jit::Node *insertion_point; std::vector shape; }; using ReshapePutterHelper = std::vector; void markCondOutputs(torch::jit::Graph *graph, torch::jit::Node *outputs, torch::jit::Node *insertion_point, Subgraph *subgraph, ReshapePutterHelper &reshape_putter_helper, bool processingElseOutputs = false) { torch::jit::WithInsertPoint insert_point(outputs); // Sometimes the return might not be processed in this node. const bool used_in_subgraph = markInputsAsComingFromParent(graph, outputs, subgraph); at::ArrayRef inputs = outputs->inputs(); for (size_t idx = 0; idx < inputs.size(); idx++) { torch::jit::Value *output = inputs[idx]; WithNodeMetadata meta{output->node()}; // Output tensor shape has to be read before adding IdentityOp as the shape // info does not propagate to the op output. const auto output_shape = shapeFromTensor(output); // Add an identity op in lieu if the op isn't used in the subgraph to make // sure popart handles the alias correctly. if (!used_in_subgraph) { torch::jit::Node *node = createIdentity(graph, {output}); output = node->output(); } // PopART doesn't allow inputs to be outputs directly. 
if (subgraph->reverse_input_map.find(output) != subgraph->reverse_input_map.end()) { output = subgraph->reverse_input_map[output]; } if (processingElseOutputs) { // Processing the else branch of the cond op. Here we make sure the // outputs of the branches have the same shapes. If not, we add a reshape // in the `then` branch. const auto &then_out_shape = reshape_putter_helper[idx].shape; const auto &else_out_shape = output_shape; if (else_out_shape.empty()) { ERROR("`else` branch output has an empty shape, so adding a reshape " "op to the `then` branch to achieve shapes identity is not " "possible!"); } if (then_out_shape != else_out_shape) { // In case if branches output shapes differ, there is a reshape added: // 1. Create a reshape op torch::jit::Node *reshape_node = nullptr; { torch::jit::WithInsertPoint reshape_insert_point( reshape_putter_helper[idx].insertion_point); auto *tensor_to_reshape = reshape_putter_helper[idx].insertion_point->input(); reshape_node = createReshape(graph, tensor_to_reshape, else_out_shape); } // 2. Create a new output tensor of the `then` branch (being the reshape // output) and insert it before the original output tensor op. torch::jit::Node *new_then_output_node = createAddOutputTensor(graph, reshape_node->output()); insertNodeBeforeNode(new_then_output_node, reshape_putter_helper[idx].insertion_point); // 3. Remove the original output tensor op returning the wrongly shaped // tensor. reshape_putter_helper[idx].insertion_point->destroy(); } // Create the output tensor of the `else` branch. torch::jit::Node *else_output_node = createAddOutputTensor(graph, output); insertNodeBeforeNode(else_output_node, insertion_point); } else { // Create the output tensor of the `then` branch. // In case the output tensor turns out to be of a different shape then // `else` branch'es one, it will be replaced with the reshaped output // tensor. 
torch::jit::Node *then_output_node = createAddOutputTensor(graph, output); insertNodeBeforeNode(then_output_node, insertion_point); reshape_putter_helper.push_back( {then_output_node, shapeFromTensor(output)}); } } } void insertSetAttribute(torch::jit::Graph *graph, size_t cond_nest_lvl, torch::jit::Node *insertion_point, std::mt19937 &random_gen, bool after_insert_pnt = false) { torch::jit::WithInsertPoint set_attr_insert_point(insertion_point); WithNodeMetadata meta{insertion_point}; std::uniform_int_distribution<> distribution; const std::string id{"cond_id_" + std::to_string(distribution(random_gen))}; const std::string cond_context{"cond_context_" + std::to_string(cond_nest_lvl)}; createSetAttribute(graph, "__outline", cond_context, id, after_insert_pnt); } void insertClearAttribute(torch::jit::Graph *graph, size_t cond_nest_lvl, torch::jit::Node *insertion_point, bool after_insert_pnt = false) { torch::jit::WithInsertPoint clr_attr_insert_point(insertion_point); WithNodeMetadata meta{insertion_point}; const std::string cond_context = "cond_context_" + std::to_string(cond_nest_lvl); createClearAttribute(graph, "__outline", cond_context, after_insert_pnt); } // State during the dispatcher intercept calls. std::stack start_for_loop_nodes; } // namespace /* * Certain ops are essentially subgraphs within the main graph. For instance * for loops. If they have a tensor which comes from the subgraph * above we must add a specific input entry op to the graph for that op. */ void annotateSubgraphs(torch::jit::Graph *graph, torch::jit::Node *start_node) { logging::LogContext ctx_func("annotateSubgraphs Processing"); // Subgraph start to all nodes contained directly within that subgraph. std::stack subgraph_nodes; // Nodes to delete (if they are truely unused). std::unordered_set to_delete; // Helper struct for processing if_else. 
std::stack reshape_putter_helpers_stack; // Random generator - used to generate ids for cond operator entities std::mt19937 random_gen(0); // Look for any subgraphs. Subgraphs are currently: // * for loops. for (auto iter = start_node->iterator(); iter != graph->nodes().end(); ++iter) { torch::jit::Node *node = *iter; logging::LogContext ctx("Processing " + nodeToString(node)); const torch::jit::Symbol kind = node->kind(); if (kind == symbols::poptorch::start_for_loop) { // Start tracking the new subgraph. subgraph_nodes.push(Subgraph()); torch::jit::WithInsertPoint insert_point(node->next()); markInputsAsComingFromParent(graph, node->input()->node(), &subgraph_nodes.top(), false); // We no longer need these inputs. to_delete.insert(node->input(0)->node()); node->removeInput(0); } else if (kind == symbols::poptorch::end_for_loop) { markOutputs(graph, node->input(0)->node(), node, &subgraph_nodes.top()); subgraph_nodes.pop(); // We no longer need these inputs. to_delete.insert(node->input(0)->node()); node->removeInput(0); } else if (kind == symbols::poptorch::start_if_block) { // Start tracking the new subgraph. subgraph_nodes.push(Subgraph()); // if/else block branches code have to get their own context, so that // popart outlining does not tear them appart and break their logic. insertSetAttribute(graph, subgraph_nodes.size(), node, random_gen); // Delete the input node (condition) as it is not needed anymore. to_delete.insert(node->input(0)->node()); node->removeInput(0); } else if (kind == symbols::poptorch::start_else_block) { // Add the outputs of `then` branch just before starting the else block. reshape_putter_helpers_stack.emplace(); markCondOutputs(graph, node->input(0)->node(), node, &subgraph_nodes.top(), reshape_putter_helpers_stack.top(), false /*processingElseOuputs*/); insertClearAttribute(graph, subgraph_nodes.size(), node); // Remove the if subgraph. subgraph_nodes.pop(); // Start tracking the new subgraph. 
subgraph_nodes.push(Subgraph()); insertSetAttribute(graph, subgraph_nodes.size(), node, random_gen, true /* after_insert_pnt */); // Delete the input node (then_branch output), as it is not needed // anymore. to_delete.insert(node->input(0)->node()); node->removeInput(0); } else if (kind == symbols::poptorch::end_if_block) { // Mark the outputs of the `else` block. markCondOutputs(graph, node->input(0)->node(), node, &subgraph_nodes.top(), reshape_putter_helpers_stack.top(), true /*processingElseOutputs*/); reshape_putter_helpers_stack.pop(); insertClearAttribute(graph, subgraph_nodes.size(), node, true /* after_insert_pnt */); // Remove the else subgraph. subgraph_nodes.pop(); // Record the number of outputs. const std::size_t num_outputs = node->input(0)->node()->inputs().size(); node->i_(c10::Symbol::fromQualString("attr::num_outputs"), num_outputs); // Delete the 1st input node (else_branch output), as it is not needed // anymore. to_delete.insert(node->input(0)->node()); node->removeInput(0); } else if (kind == symbols::poptorch::add_untyped_input_tensor) { continue; } else if (!subgraph_nodes.empty()) { // Don't count the list construction nodes. if (isUsedInTerminator(node)) { continue; } // Add this node to the active subgraph. subgraph_nodes.top().nodes.insert(node); torch::jit::WithInsertPoint insert_point(node); markInputsAsComingFromParent(graph, node, &subgraph_nodes.top()); for (const std::pair &pair : subgraph_nodes.top().input_map) { node->replaceInputWith(pair.second, pair.first); } } } for (torch::jit::Node *node : to_delete) { if (node->output()->uses().empty()) { node->destroy(); } } } } // namespace poptorch ================================================ FILE: poptorch/source/AliasProcessing.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#include #include #include "poptorch/AliasProcessing.hpp" namespace poptorch { void resolveAliases(torch::jit::Graph *graph) { std::vector to_delete; for (auto *node : graph->nodes()) { if (node->kind() != c10::aten::alias) { continue; } // Replace the alias output with the direct input node->output()->replaceAllUsesWith(node->input()); to_delete.push_back(node); } for (auto *node : to_delete) { node->destroy(); } } } // namespace poptorch ================================================ FILE: poptorch/source/CPUOffloadingCleanUp.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include #include "PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Logging.hpp" /* * CPU operations come in from the host in the form: Tensor[] %outputs = poptorch::call_cpu_op(%inputs) ... # Any traced user operations (to keep the trace consistent/happy) = poptorch::end_cpu_op(%8) * We need to do two things. Firstly we need to cull all the ops inbetween * call_cpu_op and end_cpu_op. Secondly we need to map the users of * poptorch::end_cpu_op to the outputs of poptorch::call_cpu_op. * * To do this we simply traverse through the nodes and record when we enter and * exit CPU op scope i.e between poptorch::call_cpu_op a poptorch::end_cpu_op. */ // extern c10::Symbol call_cpu_op; namespace poptorch { void cpuOffloadingCleanup(torch::jit::Graph *graph) { std::unordered_set to_delete; // For diagnostics. std::size_t cpu_ops_found = 0; // The CPU op we are currently working on. torch::jit::Node *cpu_op_in_scope = nullptr; // For all nodes in the IR. for (torch::jit::Node *node : graph->nodes()) { const torch::jit::Symbol kind = node->kind(); // Start CPU op scope. if (kind == symbols::poptorch::call_cpu_op) { ERROR_ON_MSG( cpu_op_in_scope != nullptr, "Trying to enter CPU from another CPU op! 
CPU ops must not overlap."); cpu_ops_found++; cpu_op_in_scope = node; } else if (kind == symbols::poptorch::canonicalised_cpu_call) { ERROR_ON_MSG( cpu_op_in_scope != nullptr, "Trying to enter CPU from another CPU op! CPU ops must not overlap."); cpu_ops_found++; cpu_op_in_scope = node; } else if (kind == symbols::poptorch::end_cpu_op) { to_delete.insert(node); // The form should be that the `end_cpu_op` feeds into a `ListUnpack` node // which converts the single output of the `end_cpu_op` (representing a // tuple/list) into multiple outputs. We transform it to eliminate that // unpack. torch::jit::Value *output = node->output(); std::vector uses = output->uses(); ERROR_ON_MSG( uses.empty(), "[Internal compiler error] CPU operation output has no uses."); ERROR_ON_MSG( uses.size() > 1, "[Internal compiler error] CPU operation output has multiple uses."); // List unpack torch::jit::Node *unpack = uses[0].user; ERROR_ON_MSG(unpack->kind() != c10::prim::ListUnpack, "[Internal compiler error] CPU operation output is not used " "by a list unpack"); unpack->removeAllInputs(); ERROR_ON_MSG(cpu_op_in_scope == nullptr, "[Internal compiler error] CPU operation is null"); // Remove the output. // Add the outputs and remap them to point to what the unpack previously // was used in. for (torch::jit::Value *old_out : unpack->outputs()) { torch::jit::Value *new_out = cpu_op_in_scope->addOutput(); new_out->copyMetadata(old_out); old_out->replaceAllUsesWith(new_out); } // Remove the unpack. to_delete.insert(unpack); // Leave CPU scope. cpu_op_in_scope = nullptr; } else if (cpu_op_in_scope != nullptr) { // Unfortunately the compiler can put some non-functional SSA unpack ops // in the CPU scope that do logically outlive it. if (node->kind() != c10::prim::ListUnpack) { // Enables us to clean up some nodes without invalidating the IR. node->removeAllInputs(); // Record the op for removal. to_delete.insert(node); } } } logging::trace("Found {} cpu ops. 
Removed {} nodes", cpu_ops_found, to_delete.size()); // Remove the dead nodes. searchAndPossiblyDestroy(to_delete); } } // namespace poptorch ================================================ FILE: poptorch/source/CompilerOps.cpp.inc ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. // Auto generated file, do not modify // Run `python3 scripts/PopParse.py` to regenerate // clang-format off torch::jit::Node* createCopyvarupdate(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::copyvarupdate, args, ImplicitCast::None, OutputType::Unknown); return new_node; } torch::jit::Node* createBucketize(torch::jit::Graph *graph, const std::vector& args, bool right) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::bucketize, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("right"), static_cast(right)); return new_node; } torch::jit::Node* createBatchnormalization(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,float epsilon,float momentum, unsigned int num_node_outputs) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::batchnormalization, args, ImplicitCast::None, OutputType::AsFirstInput, num_node_outputs); new_node->i_(c10::Symbol::attr("num_outputs"),num_outputs); new_node->f_(c10::Symbol::attr("epsilon"),epsilon); new_node->f_(c10::Symbol::attr("momentum"),momentum); return new_node; } torch::jit::Node* createGroupnormalization(torch::jit::Graph *graph, const std::vector& args,int64_t num_groups,float epsilon) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::groupnormalization, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("num_groups"),num_groups); new_node->f_(c10::Symbol::attr("epsilon"),epsilon); return new_node; } torch::jit::Node* createSubsample(torch::jit::Graph 
*graph, const std::vector& args,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::subsample, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createPrinttensor(torch::jit::Graph *graph, const std::vector& args,int64_t print_gradient,const std::string & title,const int summariseThreshold,const int edgeItems,const int maxLineWidth,const int digits,const int floatFormat,const char separator,const char openBracket,const char closeBracket) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::printtensor, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("print_gradient"),print_gradient); new_node->s_(c10::Symbol::attr("title"),title); new_node->i_(c10::Symbol::attr("summariseThreshold"),summariseThreshold); new_node->i_(c10::Symbol::attr("edgeItems"),edgeItems); new_node->i_(c10::Symbol::attr("maxLineWidth"),maxLineWidth); new_node->i_(c10::Symbol::attr("digits"),digits); new_node->i_(c10::Symbol::attr("floatFormat"),floatFormat); new_node->i_(c10::Symbol::attr("separator"),separator); new_node->i_(c10::Symbol::attr("openBracket"),openBracket); new_node->i_(c10::Symbol::attr("closeBracket"),closeBracket); return new_node; } torch::jit::Node* createNop(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::nop, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createScale(torch::jit::Graph *graph, const std::vector& args,float scale) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::scale, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("scale"),scale); return new_node; } torch::jit::Node* createScaledadd(torch::jit::Graph *graph, const std::vector& args,float scale0,float scale1) { torch::jit::Node 
*new_node = createAndInsertNode(graph, symbols::popart::scaledadd, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("scale0"),scale0); new_node->f_(c10::Symbol::attr("scale1"),scale1); return new_node; } torch::jit::Node* createLstm(torch::jit::Graph *graph, const std::vector& args,int64_t outputFullSequence) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::lstm, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted, 2); new_node->i_(c10::Symbol::attr("outputFullSequence"),outputFullSequence); return new_node; } torch::jit::Node* createGelu(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::gelu, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createGeluErf(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::geluerf, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createDetach(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::detach, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createDepthtospace(torch::jit::Graph *graph, const std::vector& args,int64_t blocksize,const std::string & mode) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::depthtospace, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("blocksize"),blocksize); new_node->s_(c10::Symbol::attr("mode"),mode); return new_node; } torch::jit::Node* createRound(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::round, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createNearbyInt(torch::jit::Graph *graph, const 
std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::nearbyint, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createDynamicslice(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes,std::int32_t noOverlap) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dynamicslice, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->is_(c10::Symbol::attr("sizes"),sizes); new_node->i_(c10::Symbol::attr("noOverlap"),noOverlap); return new_node; } torch::jit::Node* createDynamicupdate(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes,std::int32_t noOverlap) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dynamicupdate, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->is_(c10::Symbol::attr("sizes"),sizes); new_node->i_(c10::Symbol::attr("noOverlap"),noOverlap); return new_node; } torch::jit::Node* createDynamiczero(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dynamiczero, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->is_(c10::Symbol::attr("sizes"),sizes); return new_node; } torch::jit::Node* createDynamicadd(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dynamicadd, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->is_(c10::Symbol::attr("sizes"),sizes); return new_node; } torch::jit::Node* createSequenceslice(torch::jit::Graph *graph, const std::vector& args,std::int32_t zeroUnused) { 
torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::sequenceslice, args, ImplicitCast::None, OutputType::Unknown); new_node->i_(c10::Symbol::attr("zeroUnused"),zeroUnused); return new_node; } torch::jit::Node* createL1loss(torch::jit::Graph *graph, const std::vector& args,const float lambda,std::int32_t reduction) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::l1loss, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("lambda"),lambda); new_node->i_(c10::Symbol::attr("reduction"),reduction); return new_node; } torch::jit::Node* createNllloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction,std::int32_t ignoreIndex,bool inputIsLogProbability) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::nllloss, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("reduction"),reduction); new_node->i_(c10::Symbol::attr("ignoreIndex"),ignoreIndex); new_node->i_(c10::Symbol::attr("inputIsLogProbability"),inputIsLogProbability); return new_node; } torch::jit::Node* createIdentityloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::identityloss, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("reduction"),reduction); return new_node; } torch::jit::Node* create_ctcloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction,const unsigned int blank,const std::string & outDataType,const bool zeroInfinity) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::_ctcloss, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("reduction"),reduction); new_node->i_(c10::Symbol::attr("blank"),blank); new_node->s_(c10::Symbol::attr("outDataType"),outDataType); new_node->i_(c10::Symbol::attr("zeroInfinity"),zeroInfinity); return 
new_node; } torch::jit::Node* createCtcbeamsearchdecoder(torch::jit::Graph *graph, const std::vector& args,unsigned int blank,unsigned int beamWidth,unsigned int topPaths) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::ctcbeamsearchdecoder, args, ImplicitCast::None, OutputType::Unknown); new_node->i_(c10::Symbol::attr("blank"),blank); new_node->i_(c10::Symbol::attr("beamWidth"),beamWidth); new_node->i_(c10::Symbol::attr("topPaths"),topPaths); return new_node; } torch::jit::Node* createShapeddropout(torch::jit::Graph *graph, const std::vector& args,const std::vector & shape,float ratio) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::shapeddropout, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("shape"),shape); new_node->f_(c10::Symbol::attr("ratio"),ratio); return new_node; } torch::jit::Node* createAtan2(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::atan2, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createExpm1(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::expm1, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createLog1p(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::log1p, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createFmod(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::fmod, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createRemainder(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, 
symbols::popart::remainder, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createReverse(torch::jit::Graph *graph, const std::vector& args,const std::vector & dimensions) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reverse, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("dimensions"),dimensions); return new_node; } torch::jit::Node* createSlice(torch::jit::Graph *graph, const std::vector& args,const std::vector & ends,const std::vector & starts,const std::vector & axes) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::slice, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("ends"),ends); new_node->is_(c10::Symbol::attr("starts"),starts); new_node->is_(c10::Symbol::attr("axes"),axes); return new_node; } torch::jit::Node* createBitwisenot(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::bitwisenot, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createBitwiseand(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::bitwiseand, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createBitwiseor(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::bitwiseor, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createBitwisexor(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::bitwisexor, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createBitwisexnor(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = 
createAndInsertNode(graph, symbols::popart::bitwisexnor, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createReducemedian(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducemedian, args, ImplicitCast::None, OutputType::FirstAsFirstInputSecondAlwaysInt, 2); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createScatterreduce(torch::jit::Graph *graph, const std::vector& args,std::int32_t axis_size,std::int32_t axis,bool enable_index_broadcast,std::int32_t reduction) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::scatterreduce, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis_size"),axis_size); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("reduction"),reduction); new_node->i_(c10::Symbol::attr("enable_index_broadcast"), static_cast(enable_index_broadcast)); return new_node; } torch::jit::Node* createGroupedscatterreduce(torch::jit::Graph *graph, const std::vector& args,std::int32_t axis_size,std::int32_t axis,std::int32_t group_size,bool enable_index_broadcast,std::int32_t reduction) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::groupedscatterreduce, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis_size"),axis_size); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("reduction"),reduction); new_node->i_(c10::Symbol::attr("group_size"), group_size); new_node->i_(c10::Symbol::attr("enable_index_broadcast"), static_cast(enable_index_broadcast)); return new_node; } torch::jit::Node* createSwish(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::swish, args, 
ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createAveragepool(torch::jit::Graph *graph, const std::vector& args,const std::vector & kernel_shape,int64_t ceil_mode,int64_t count_include_pad,const std::vector & pads,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::averagepool, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("kernel_shape"),kernel_shape); new_node->i_(c10::Symbol::attr("ceil_mode"),ceil_mode); new_node->i_(c10::Symbol::attr("count_include_pad"),count_include_pad); new_node->is_(c10::Symbol::attr("pads"),pads); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createConvinteger(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & pads,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::convinteger, args, ImplicitCast::None, OutputType::AlwaysInt); new_node->is_(c10::Symbol::attr("dilations"),dilations); new_node->i_(c10::Symbol::attr("group"),group); new_node->is_(c10::Symbol::attr("kernel_shape"),kernel_shape); new_node->is_(c10::Symbol::attr("pads"),pads); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createDequantizelinear(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dequantizelinear, args, ImplicitCast::ExceptSecond, OutputType::AlwaysFloat); return new_node; } torch::jit::Node* createDropout(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,float ratio) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::dropout, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("num_outputs"),num_outputs); 
new_node->f_(c10::Symbol::attr("ratio"),ratio); return new_node; }

// Node factories for PopART ops. Each function calls createAndInsertNode()
// with the op symbol, an ImplicitCast mode and an OutputType rule, stores the
// remaining arguments as attributes on the resulting node (in the order
// written below) and returns that node.
// NOTE(review): template arguments (on std::vector / static_cast) appear to be
// missing from this text; signatures are reproduced as found - confirm against
// the generator output.

// popart::isinf; attributes: detect_negative, detect_positive.
torch::jit::Node *createIsinf(torch::jit::Graph *graph, const std::vector &args,
                              int64_t detect_negative,
                              int64_t detect_positive) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::isinf, args,
                          ImplicitCast::None, OutputType::AlwaysBool);
  node->i_(c10::Symbol::attr("detect_negative"), detect_negative);
  node->i_(c10::Symbol::attr("detect_positive"), detect_positive);
  return node;
}

// popart::matmulinteger; no attributes.
torch::jit::Node *createMatmulinteger(torch::jit::Graph *graph,
                                      const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::matmulinteger, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}

// popart::maxpool; attributes: num_outputs, kernel_shape, ceil_mode,
// dilations, pads, storage_order, strides.
// NOTE(review): num_outputs is only stored as an attribute here; it is not
// forwarded to createAndInsertNode() the way createSplit does - confirm.
torch::jit::Node *createMaxpool(torch::jit::Graph *graph,
                                const std::vector &args,
                                unsigned int num_outputs,
                                const std::vector &kernel_shape,
                                int64_t ceil_mode, const std::vector &dilations,
                                const std::vector &pads, int64_t storage_order,
                                const std::vector &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::maxpool, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->i_(c10::Symbol::attr("ceil_mode"), ceil_mode);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->i_(c10::Symbol::attr("storage_order"), storage_order);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}

// popart::mod; attribute: fmod.
torch::jit::Node *createMod(torch::jit::Graph *graph, const std::vector &args,
                            int64_t fmod) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::mod, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted);
  node->i_(c10::Symbol::attr("fmod"), fmod);
  return node;
}

// popart::nonmaxsuppression; attribute: center_point_box.
torch::jit::Node *createNonmaxsuppression(torch::jit::Graph *graph,
                                          const std::vector &args,
                                          int64_t center_point_box) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::nonmaxsuppression, args,
                          ImplicitCast::None, OutputType::AlwaysInt);
  node->i_(c10::Symbol::attr("center_point_box"), center_point_box);
  return node;
}

// popart::qlinearconv; attributes: dilations, group, kernel_shape, pads,
// strides.
torch::jit::Node *createQlinearconv(torch::jit::Graph *graph,
                                    const std::vector &args,
                                    const std::vector &dilations, int64_t group,
                                    const std::vector &kernel_shape,
                                    const std::vector &pads,
                                    const std::vector &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::qlinearconv, args,
                          ImplicitCast::None, OutputType::AlwaysUint8);
  node->is_(c10::Symbol::attr("dilations"), dilations);
  node->i_(c10::Symbol::attr("group"), group);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}

// popart::qlinearmatmul; no attributes.
torch::jit::Node *createQlinearmatmul(torch::jit::Graph *graph,
                                      const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::qlinearmatmul, args,
                             ImplicitCast::None, OutputType::AlwaysUint8);
}

// popart::quantizelinear; no attributes.
torch::jit::Node *createQuantizelinear(torch::jit::Graph *graph,
                                       const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::quantizelinear, args,
                             ImplicitCast::None, OutputType::AlwaysUint8);
}

// popart::resize; attributes: coordinate_transformation_mode, cubic_coeff_a,
// exclude_outside, extrapolation_value, mode, nearest_mode.
torch::jit::Node *createResize(torch::jit::Graph *graph,
                               const std::vector &args,
                               const std::string &coordinate_transformation_mode,
                               float cubic_coeff_a, int64_t exclude_outside,
                               float extrapolation_value,
                               const std::string &mode,
                               const std::string &nearest_mode) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::resize, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->s_(c10::Symbol::attr("coordinate_transformation_mode"),
           coordinate_transformation_mode);
  node->f_(c10::Symbol::attr("cubic_coeff_a"), cubic_coeff_a);
  node->i_(c10::Symbol::attr("exclude_outside"), exclude_outside);
  node->f_(c10::Symbol::attr("extrapolation_value"), extrapolation_value);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->s_(c10::Symbol::attr("nearest_mode"), nearest_mode);
  return node;
}

// popart::reversesequence; attributes: batch_axis, time_axis.
torch::jit::Node *createReversesequence(torch::jit::Graph *graph,
                                        const std::vector &args,
                                        int64_t batch_axis,
                                        int64_t time_axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::reversesequence, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("batch_axis"), batch_axis);
  node->i_(c10::Symbol::attr("time_axis"), time_axis);
  return node;
}

// popart::roialign; attributes: mode, output_height, output_width,
// sampling_ratio, spatial_scale.
torch::jit::Node *createRoialign(torch::jit::Graph *graph,
                                 const std::vector &args,
                                 const std::string &mode, int64_t output_height,
                                 int64_t output_width, int64_t sampling_ratio,
                                 float spatial_scale) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::roialign, args, ImplicitCast::ExceptThird,
      OutputType::AsImplicitCastPromoted);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->i_(c10::Symbol::attr("output_height"), output_height);
  node->i_(c10::Symbol::attr("output_width"), output_width);
  node->i_(c10::Symbol::attr("sampling_ratio"), sampling_ratio);
  node->f_(c10::Symbol::attr("spatial_scale"), spatial_scale);
  return node;
}

// popart::thresholdedrelu; attribute: alpha.
torch::jit::Node *createThresholdedrelu(torch::jit::Graph *graph,
                                        const std::vector &args, float alpha) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::thresholdedrelu, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  return node;
}

// popart::topk, created with two outputs; attributes: axis, largest, sorted.
// The bool flags go through the integer attribute setter.
torch::jit::Node *createTopk(torch::jit::Graph *graph, const std::vector &args,
                             int64_t axis, bool largest, bool sorted) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::topk, args, ImplicitCast::None,
      OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("largest"), static_cast(largest));
  node->i_(c10::Symbol::attr("sorted"), static_cast(sorted));
  return node;
}

// popart::sort, created with two outputs; attributes: axis, descending,
// stable. The bool flags go through the integer attribute setter.
torch::jit::Node *createSort(torch::jit::Graph *graph, const std::vector &args,
                             int64_t axis, bool descending, bool stable) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::sort, args, ImplicitCast::None,
      OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  node->i_(c10::Symbol::attr("axis"), axis);
  node->i_(c10::Symbol::attr("descending"), static_cast(descending));
  node->i_(c10::Symbol::attr("stable"), static_cast(stable));
  return node;
}

// popart::upsample; attribute: mode.
torch::jit::Node *createUpsample(torch::jit::Graph *graph,
                                 const std::vector &args,
                                 const std::string &mode) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::upsample, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->s_(c10::Symbol::attr("mode"), mode);
  return node;
}

// popart::acosh; no attributes.
torch::jit::Node *createAcosh(torch::jit::Graph *graph,
                              const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::acosh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::asinh; no attributes.
torch::jit::Node *createAsinh(torch::jit::Graph *graph,
                              const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::asinh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::atanh; no attributes.
torch::jit::Node *createAtanh(torch::jit::Graph *graph,
                              const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::atanh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::cast; attribute: to.
torch::jit::Node *createCast(torch::jit::Graph *graph, const std::vector &args,
                             const std::string &to) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::cast, args,
                          ImplicitCast::None, OutputType::AsDtype);
  node->s_(c10::Symbol::attr("to"), to);
  // Output types are set again now that the attribute is on the node -
  // presumably the AsDtype rule reads it; confirm.
  setNodeOutputsTypes(node, ImplicitCast::None, OutputType::AsDtype);
  return node;
}

// popart::compress; attribute: axis.
torch::jit::Node *createCompress(torch::jit::Graph *graph,
                                 const std::vector &args, std::int32_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::compress, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}

// popart::cosh; no attributes.
torch::jit::Node *createCosh(torch::jit::Graph *graph,
                             const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::cosh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::erf; no attributes.
torch::jit::Node *createErf(torch::jit::Graph *graph, const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::erf, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::eyelike; attributes: dtype, k.
torch::jit::Node *createEyelike(torch::jit::Graph *graph,
                                const std::vector &args, std::int32_t dtype,
                                int64_t k) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::eyelike, args,
                          ImplicitCast::None, OutputType::AsDtype);
  node->i_(c10::Symbol::attr("dtype"), dtype);
  node->i_(c10::Symbol::attr("k"), k);
  // Output types are set again now that the attributes are on the node -
  // presumably the AsDtype rule reads "dtype"; confirm.
  setNodeOutputsTypes(node, ImplicitCast::None, OutputType::AsDtype);
  return node;
}

// popart::flatten; attribute: axis.
torch::jit::Node *createFlatten(torch::jit::Graph *graph,
                                const std::vector &args, int64_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::flatten, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}

// popart::gemm; attributes: alpha, beta, transA, transB.
torch::jit::Node *createGemm(torch::jit::Graph *graph, const std::vector &args,
                             float alpha, float beta, int64_t transA,
                             int64_t transB) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::gemm, args, ImplicitCast::All,
      OutputType::AsImplicitCastPromoted);
  node->f_(c10::Symbol::attr("alpha"), alpha);
  node->f_(c10::Symbol::attr("beta"), beta);
  node->i_(c10::Symbol::attr("transA"), transA);
  node->i_(c10::Symbol::attr("transB"), transB);
  return node;
}

// popart::greater; no attributes.
torch::jit::Node *createGreater(torch::jit::Graph *graph,
                                const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::greater, args,
                             ImplicitCast::All, OutputType::AlwaysBool);
}

// popart::isnan; no attributes.
torch::jit::Node *createIsnan(torch::jit::Graph *graph,
                              const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::isnan, args,
                             ImplicitCast::None, OutputType::AlwaysBool);
}

// popart::less; no attributes.
torch::jit::Node *createLess(torch::jit::Graph *graph,
                             const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::less, args,
                             ImplicitCast::All, OutputType::AlwaysBool);
}

// popart::matmul; no attributes.
torch::jit::Node *createMatmul(torch::jit::Graph *graph,
                               const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::matmul, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}

// popart::maxunpool; attributes: kernel_shape, pads, strides.
torch::jit::Node *createMaxunpool(torch::jit::Graph *graph,
                                  const std::vector &args,
                                  const std::vector &kernel_shape,
                                  const std::vector &pads,
                                  const std::vector &strides) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::maxunpool, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("kernel_shape"), kernel_shape);
  node->is_(c10::Symbol::attr("pads"), pads);
  node->is_(c10::Symbol::attr("strides"), strides);
  return node;
}

// popart::meanvariancenormalization; attribute: axes.
torch::jit::Node *createMeanvariancenormalization(torch::jit::Graph *graph,
                                                  const std::vector &args,
                                                  const std::vector &axes) {
  torch::jit::Node *node = createAndInsertNode(
      graph, symbols::popart::meanvariancenormalization, args,
      ImplicitCast::None, OutputType::AsFirstInput);
  node->is_(c10::Symbol::attr("axes"), axes);
  return node;
}

// popart::nonzero; no attributes.
torch::jit::Node *createNonzero(torch::jit::Graph *graph,
                                const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::nonzero, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}

// popart::onehot; attribute: axis.
torch::jit::Node *createOnehot(torch::jit::Graph *graph,
                               const std::vector &args, int64_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::onehot, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}

// popart::scatter; attribute: axis.
torch::jit::Node *createScatter(torch::jit::Graph *graph,
                                const std::vector &args, int64_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::scatter, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}

// popart::scatterelements; attribute: axis.
torch::jit::Node *createScatterElements(torch::jit::Graph *graph,
                                        const std::vector &args, int64_t axis) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::scatterelements, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->i_(c10::Symbol::attr("axis"), axis);
  return node;
}

// popart::shrink; attributes: bias, lambd.
torch::jit::Node *createShrink(torch::jit::Graph *graph,
                               const std::vector &args, float bias,
                               float lambd) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::shrink, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  node->f_(c10::Symbol::attr("bias"), bias);
  node->f_(c10::Symbol::attr("lambd"), lambd);
  return node;
}

// popart::sign; no attributes.
torch::jit::Node *createSign(torch::jit::Graph *graph,
                             const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::sign, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::sinh; no attributes.
torch::jit::Node *createSinh(torch::jit::Graph *graph,
                             const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::sinh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::tfidfvectorizer; attributes: max_gram_length, max_skip_count,
// min_gram_length, mode, ngram_counts, ngram_indexes, pool_int64s,
// pool_strings, weights.
torch::jit::Node *createTfidfvectorizer(
    torch::jit::Graph *graph, const std::vector &args, int64_t max_gram_length,
    int64_t max_skip_count, int64_t min_gram_length, const std::string &mode,
    const std::vector &ngram_counts, const std::vector &ngram_indexes,
    const std::vector &pool_int64s, const std::vector &pool_strings,
    std::vector weights) {
  torch::jit::Node *node =
      createAndInsertNode(graph, symbols::popart::tfidfvectorizer, args,
                          ImplicitCast::None, OutputType::AlwaysFloat);
  node->i_(c10::Symbol::attr("max_gram_length"), max_gram_length);
  node->i_(c10::Symbol::attr("max_skip_count"), max_skip_count);
  node->i_(c10::Symbol::attr("min_gram_length"), min_gram_length);
  node->s_(c10::Symbol::attr("mode"), mode);
  node->is_(c10::Symbol::attr("ngram_counts"), ngram_counts);
  node->is_(c10::Symbol::attr("ngram_indexes"), ngram_indexes);
  node->is_(c10::Symbol::attr("pool_int64s"), pool_int64s);
  node->ss_(c10::Symbol::attr("pool_strings"), pool_strings);
  node->fs_(c10::Symbol::attr("weights"), weights);
  return node;
}

// popart::where; no attributes.
torch::jit::Node *createWhere(torch::jit::Graph *graph,
                              const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::where, args,
                             ImplicitCast::ExceptFirst,
                             OutputType::AsImplicitCastPromoted);
}

// popart::expand; no attributes.
torch::jit::Node *createExpand(torch::jit::Graph *graph,
                               const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::expand, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// popart::max; no attributes.
torch::jit::Node *createMax(torch::jit::Graph *graph, const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::max, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}

// popart::mean; no attributes.
torch::jit::Node *createMean(torch::jit::Graph *graph,
                             const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::mean, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}

// popart::min; no attributes.
torch::jit::Node *createMin(torch::jit::Graph *graph, const std::vector &args) {
  return createAndInsertNode(graph, symbols::popart::min, args,
                             ImplicitCast::All,
                             OutputType::AsImplicitCastPromoted);
}
torch::jit::Node* createSum(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::sum, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createAcos(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::acos, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createAdd(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::add, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createLogical_and(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::logical_and, args, ImplicitCast::All, OutputType::AlwaysBool); return new_node; } torch::jit::Node* createAsin(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::asin, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createAtan(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::atan, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createCos(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::cos, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createDiv(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::div, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createEqual(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node 
*new_node = createAndInsertNode(graph, symbols::popart::equal, args, ImplicitCast::All, OutputType::AlwaysBool); return new_node; } torch::jit::Node* createMul(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::mul, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createMultinomial(torch::jit::Graph *graph, const std::vector& args,int64_t dtype,int64_t sample_size,float seed) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::multinomial, args, ImplicitCast::None, OutputType::AsDtype); new_node->i_(c10::Symbol::attr("dtype"),dtype); new_node->i_(c10::Symbol::attr("sample_size"),sample_size); new_node->f_(c10::Symbol::attr("seed"),seed); setNodeOutputsTypes(new_node, ImplicitCast::None, OutputType::AsDtype); return new_node; } torch::jit::Node* createLogical_or(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::logical_or, args, ImplicitCast::All, OutputType::AlwaysBool); return new_node; } torch::jit::Node* createPow(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::pow, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createSin(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::sin, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createSub(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::sub, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createTan(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::tan, 
args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createLogical_xor(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::logical_xor, args, ImplicitCast::All, OutputType::AlwaysBool); return new_node; } torch::jit::Node* createAbs(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::abs, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createArgmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::argmax, args, ImplicitCast::None, OutputType::AlwaysInt); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createArgmin(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::argmin, args, ImplicitCast::None, OutputType::AlwaysInt); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createCeil(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::ceil, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createClip(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::clip, args, ImplicitCast::None, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node* createConcat(torch::jit::Graph *graph, const std::vector& args,int64_t axis) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::concat, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); 
new_node->i_(c10::Symbol::attr("axis"),axis); return new_node; } torch::jit::Node* createConv(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & pads,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::conv, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("dilations"),dilations); new_node->i_(c10::Symbol::attr("group"),group); new_node->is_(c10::Symbol::attr("kernel_shape"),kernel_shape); new_node->is_(c10::Symbol::attr("pads"),pads); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createConvtranspose(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & output_padding,const std::vector & output_shape,const std::vector & pads,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::convtranspose, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node->is_(c10::Symbol::attr("dilations"),dilations); new_node->i_(c10::Symbol::attr("group"),group); new_node->is_(c10::Symbol::attr("kernel_shape"),kernel_shape); new_node->is_(c10::Symbol::attr("output_padding"),output_padding); new_node->is_(c10::Symbol::attr("output_shape"),output_shape); new_node->is_(c10::Symbol::attr("pads"),pads); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createElu(torch::jit::Graph *graph, const std::vector& args,float alpha) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::elu, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("alpha"),alpha); return new_node; } torch::jit::Node* createExp(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, 
symbols::popart::exp, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createFloor(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::floor, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createGather(torch::jit::Graph *graph, const std::vector& args,int64_t axis) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::gather, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis"),axis); return new_node; } torch::jit::Node* createGroupedgather(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t group_size) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::groupedgather, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("group_size"),group_size); return new_node; } torch::jit::Node* createGlobalaveragepool(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::globalaveragepool, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createGloballppool(torch::jit::Graph *graph, const std::vector& args,int64_t p) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::globallppool, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("p"),p); return new_node; } torch::jit::Node* createGlobalmaxpool(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::globalmaxpool, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createHardsigmoid(torch::jit::Graph *graph, const std::vector& args,float alpha,float beta) { torch::jit::Node *new_node = createAndInsertNode(graph, 
symbols::popart::hardsigmoid, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("alpha"),alpha); new_node->f_(c10::Symbol::attr("beta"),beta); return new_node; } torch::jit::Node* createHardmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::hardmax, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis"),axis); return new_node; } torch::jit::Node* createIdentity(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::identity, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createInstancenormalization(torch::jit::Graph *graph, const std::vector& args,float epsilon) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::instancenormalization, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node->f_(c10::Symbol::attr("epsilon"),epsilon); return new_node; } torch::jit::Node* createLrn(torch::jit::Graph *graph, const std::vector& args,int64_t size,float alpha,float beta,float bias) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::lrn, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("size"),size); new_node->f_(c10::Symbol::attr("alpha"),alpha); new_node->f_(c10::Symbol::attr("beta"),beta); new_node->f_(c10::Symbol::attr("bias"),bias); return new_node; } torch::jit::Node* createLeakyrelu(torch::jit::Graph *graph, const std::vector& args,float alpha) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::leakyrelu, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->f_(c10::Symbol::attr("alpha"),alpha); return new_node; } torch::jit::Node* createLog(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, 
symbols::popart::log, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createLogsoftmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::logsoftmax, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis"),axis); return new_node; } torch::jit::Node* createLpnormalization(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t p) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::lpnormalization, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::attr("axis"),axis); new_node->i_(c10::Symbol::attr("p"),p); return new_node; } torch::jit::Node* createLppool(torch::jit::Graph *graph, const std::vector& args,const std::vector & kernel_shape,int64_t p,const std::vector & pads,const std::vector & strides) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::lppool, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("kernel_shape"),kernel_shape); new_node->i_(c10::Symbol::attr("p"),p); new_node->is_(c10::Symbol::attr("pads"),pads); new_node->is_(c10::Symbol::attr("strides"),strides); return new_node; } torch::jit::Node* createMaxroipool(torch::jit::Graph *graph, const std::vector& args,const std::vector & pooled_shape,float spatial_scale) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::maxroipool, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node->is_(c10::Symbol::attr("pooled_shape"),pooled_shape); new_node->f_(c10::Symbol::attr("spatial_scale"),spatial_scale); return new_node; } torch::jit::Node* createNeg(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::neg, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* 
createLogical_not(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::logical_not, args, ImplicitCast::None, OutputType::AlwaysBool); return new_node; } torch::jit::Node* createPad(torch::jit::Graph *graph, const std::vector& args,const std::string & mode) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::pad, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->s_(c10::Symbol::attr("mode"),mode); return new_node; } torch::jit::Node* createRandomnormallike(torch::jit::Graph *graph, const std::vector& args,std::int32_t dtype,float mean,float scale,float seed) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::randomnormallike, args, ImplicitCast::None, OutputType::AsDtypeOrAsPromoted); new_node->i_(c10::Symbol::attr("dtype"),dtype); new_node->f_(c10::Symbol::attr("mean"),mean); new_node->f_(c10::Symbol::attr("scale"),scale); new_node->f_(c10::Symbol::attr("seed"),seed); setNodeOutputsTypes(new_node, ImplicitCast::All, OutputType::AsDtypeOrAsPromoted); return new_node; } torch::jit::Node* createRandomuniformlike(torch::jit::Graph *graph, const std::vector& args,std::int32_t dtype,float high,float low,float seed) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::randomuniformlike, args, ImplicitCast::None, OutputType::AsDtypeOrAsPromoted); new_node->i_(c10::Symbol::attr("dtype"),dtype); new_node->f_(c10::Symbol::attr("high"),high); new_node->f_(c10::Symbol::attr("low"),low); new_node->f_(c10::Symbol::attr("seed"),seed); setNodeOutputsTypes(new_node, ImplicitCast::All, OutputType::AsDtypeOrAsPromoted); return new_node; } torch::jit::Node* createReciprocal(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reciprocal, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createReducel1(torch::jit::Graph 
*graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducel1, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducel2(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducel2, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducelogsum(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducelogsum, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducelogsumexp(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducelogsumexp, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducemax(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducemax, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducemean(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node 
*new_node = createAndInsertNode(graph, symbols::popart::reducemean, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducemin(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducemin, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReduceprod(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reduceprod, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducesum(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducesum, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createReducesumsquare(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reducesumsquare, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); new_node->i_(c10::Symbol::attr("keepdims"),keepdims); return new_node; } torch::jit::Node* createRelu(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::relu, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } 
// Creates a popart::selu node from `args` with the `alpha` and `gamma`
// attributes.
torch::jit::Node* createSelu(torch::jit::Graph *graph, const std::vector& args,float alpha,float gamma) {
  auto *const selu_node =
      createAndInsertNode(graph, symbols::popart::selu, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  selu_node->f_(c10::Symbol::attr("alpha"), alpha);
  selu_node->f_(c10::Symbol::attr("gamma"), gamma);
  return selu_node;
}

// Creates a popart::shape node from `args`; requested with
// OutputType::AlwaysInt.
torch::jit::Node* createShape(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::shape, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}

// Creates a popart::sigmoid node from `args`.
torch::jit::Node* createSigmoid(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::sigmoid, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// Creates a popart::size node from `args`; requested with
// OutputType::AlwaysInt.
torch::jit::Node* createSize(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::size, args,
                             ImplicitCast::None, OutputType::AlwaysInt);
}

// Creates a popart::softmax node from `args` with the `axis` attribute.
torch::jit::Node* createSoftmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis) {
  auto *const softmax_node =
      createAndInsertNode(graph, symbols::popart::softmax, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  softmax_node->i_(c10::Symbol::attr("axis"), axis);
  return softmax_node;
}

// Creates a popart::softplus node from `args`.
torch::jit::Node* createSoftplus(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::softplus, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// Creates a popart::softsign node from `args`.
torch::jit::Node* createSoftsign(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::softsign, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// Creates a popart::spacetodepth node from `args` with the `blocksize`
// attribute.
torch::jit::Node* createSpacetodepth(torch::jit::Graph *graph, const std::vector& args,int64_t blocksize) {
  auto *const s2d_node =
      createAndInsertNode(graph, symbols::popart::spacetodepth, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  s2d_node->i_(c10::Symbol::attr("blocksize"), blocksize);
  return s2d_node;
}

// Creates a popart::splinebasis node from `args` with the `degree` attribute.
// The node is requested with two outputs
// (OutputType::FirstAsFirstInputSecondAlwaysInt).
torch::jit::Node* createSplinebasis(torch::jit::Graph *graph, const std::vector& args,std::int32_t degree) {
  auto *const basis_node = createAndInsertNode(
      graph, symbols::popart::splinebasis, args, ImplicitCast::None,
      OutputType::FirstAsFirstInputSecondAlwaysInt, 2);
  basis_node->i_(c10::Symbol::attr("degree"), degree);
  return basis_node;
}

// Creates a popart::splineweighting node from `args`.
torch::jit::Node* createSplineweighting(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::splineweighting, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// Creates a popart::split node from `args` with `num_outputs` outputs and
// records `num_outputs`, `axis` and `split` as attributes.
torch::jit::Node* createSplit(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,int64_t axis,const std::vector & split) {
  auto *const split_node = createAndInsertNode(
      graph, symbols::popart::split, args, ImplicitCast::None,
      OutputType::AsFirstInput, num_outputs);
  split_node->i_(c10::Symbol::attr("num_outputs"), num_outputs);
  split_node->i_(c10::Symbol::attr("axis"), axis);
  split_node->is_(c10::Symbol::attr("split"), split);
  return split_node;
}

// Creates a popart::sqrt node from `args`.
torch::jit::Node* createSqrt(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::sqrt, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}

// Creates a popart::squeeze node from `args` with the `axes` attribute.
torch::jit::Node* createSqueeze(torch::jit::Graph *graph, const std::vector& args,const std::vector & axes) {
  auto *const squeeze_node =
      createAndInsertNode(graph, symbols::popart::squeeze, args,
                          ImplicitCast::None, OutputType::AsFirstInput);
  squeeze_node->is_(c10::Symbol::attr("axes"), axes);
  return squeeze_node;
}

// Creates a popart::tanh node from `args`.
torch::jit::Node* createTanh(torch::jit::Graph *graph, const std::vector& args) {
  return createAndInsertNode(graph, symbols::popart::tanh, args,
                             ImplicitCast::None, OutputType::AsFirstInput);
}
torch::jit::Node* createTile(torch::jit::Graph *graph, const std::vector& args) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::tile, args, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } torch::jit::Node* createTranspose(torch::jit::Graph *graph, const std::vector& args,const std::vector & perm) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::transpose, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("perm"),perm); return new_node; } torch::jit::Node* createUnsqueeze(torch::jit::Graph *graph, const std::vector& args,const std::vector & axes) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::unsqueeze, args, ImplicitCast::None, OutputType::AsFirstInput); new_node->is_(c10::Symbol::attr("axes"),axes); return new_node; } ================================================ FILE: poptorch/source/ErrorOnUnsupportedAten.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include "poptorch/PopartCanonicalization.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { void errorOnUnsupportedAten(torch::jit::Graph *graph) { // Check that all of the "aten::" ops have been eliminated. std::unordered_set unsupported_ops; for (torch::jit::Node *node : graph->nodes()) { if (node->kind().is_aten()) { unsupported_ops.insert(node->kind()); } } // Terminate compilation via error. if (!unsupported_ops.empty()) { std::stringstream ss; std::string sep; for (const auto &op : unsupported_ops) { ss << sep << op.toQualString(); sep = ", "; } ERROR("Unsupported ops found in compiled model: [" << ss.str() << "]. Not all operations are supported yet by Graphcore's PyTorch " "compiler. 
If you believe any of these should be, please report " "this message to support@graphcore.ai."); } } } // namespace poptorch ================================================ FILE: poptorch/source/FixupSetAvailableMemory.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include #include #include #include #include "PoptorchSymbols.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" using torch::jit::Node; using torch::jit::Value; namespace poptorch { namespace { std::vector amp_possible_input_nodes; bool isValidInputOpForAMP(const Node *node) { namespace popart_syms = poptorch::symbols::popart; const auto kind = node->kind(); return kind == popart_syms::gather || kind == popart_syms::lstm || kind == popart_syms::matmul || kind == popart_syms::scatter || kind == popart_syms::scatterreduce; } // Returns true if the given node was removed by searching the possible inputs // backwards. 
bool tryRemovePossibleInput(const Node *input) { auto input_nodes_count = amp_possible_input_nodes.size(); auto remove_position = std::remove(amp_possible_input_nodes.rbegin(), amp_possible_input_nodes.rend(), input); amp_possible_input_nodes.erase(amp_possible_input_nodes.rend().base(), remove_position.base()); return input_nodes_count > amp_possible_input_nodes.size(); } torch::jit::Value *tryFindAncestor(torch::jit::Value *v, int depth_to_check, int depth = 0) { if (depth == depth_to_check) { if (tryRemovePossibleInput(v->node())) { return v; } } for (auto *inp : v->node()->inputs()) { if (torch::jit::Value *ancestor = tryFindAncestor(inp, depth_to_check, depth + 1)) { return ancestor; } } return nullptr; } } // namespace void setAvailableMemoryAddPossibleInputOp(torch::jit::Node *node) { if (!isValidInputOpForAMP(node)) { return; } logging::trace("Adding node {} as a possible input to set_available_memory", nodeToString(node)); amp_possible_input_nodes.push_back(node); } void moveSetAvailableMemoryIfRequired( torch::jit::Node *set_available_memory_node) { ERROR_ON(set_available_memory_node->kind() != poptorch::symbols::poptorch::set_available_memory); if (amp_possible_input_nodes.empty()) { return; } // If the current input is already in the possible inputs list, remove it, // and return. Node *current_input_node = set_available_memory_node->input(0)->node(); if (tryRemovePossibleInput(current_input_node)) { return; } logging::trace("Found set_available_memory node that might need fixup: {}", nodeToString(set_available_memory_node)); // The current input isn't among the possible inputs. Try to go through the // inputs of the input. // // If we don't find anything: try one more level. (In some cases there is // a reshape followed by an add). // // We don't want to do a full search in the graph as it // might lead to undesired results. 
The search will be limited to the // possible grandparent and great grandparent nodes that are made of // decomposed ops such as scatteradd and linear. These ops are composed // of multiple ops, and set_available_memory needs to find the suitable // op among them. torch::jit::Value *new_input = tryFindAncestor(set_available_memory_node->input(0), 1); if (new_input == nullptr) { new_input = tryFindAncestor(set_available_memory_node->input(0), 2); } if (new_input == nullptr) { logging::trace( "No matching ancestor found for set_available_memory node {}", nodeToString(set_available_memory_node)); return; } auto *current_input = set_available_memory_node->input(0); logging::trace("Replacing set_available_memory input '%{}' with '%{}'", current_input->debugName(), new_input->debugName()); // Remove set_available_memory_node from its current position set_available_memory_node->output()->replaceAllUsesWith( set_available_memory_node->input(0)); // Replace all the uses of the new input with "set_available_memory" new_input->replaceAllUsesWith(set_available_memory_node->output()); // Update set_available_memory's input set_available_memory_node->moveAfter(new_input->node()); set_available_memory_node->replaceInput(0, new_input); } } // namespace poptorch ================================================ FILE: poptorch/source/GNNOptimizations.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#include #include #include #include #include #include #include #include
#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include "PoptorchSymbols.hpp"

namespace poptorch {
namespace {

// Inputs of one scatterreduce / gather node, as returned by getInputArgs():
// input(0), input(1) and, only when with_update is set, input(2).
using InputArgs = std::tuple;
// One vector per input position, each holding one entry per grouped node.
using GroupedInputArgs = std::array, 3>;
// Builds the grouped op replacing a list of nodes from the concatenated
// inputs.
using GroupedOpFactory = std::function &)>;

void groupScatterReduceNodes(torch::jit::Graph *graph);
void groupGatherNodes(torch::jit::Graph *graph);
void initQueue(torch::jit::Graph *graph, std::queue &queue,
               torch::jit::node_list &barriers);
std::size_t deduceOpStage(const torch::jit::Node *node,
                          const torch::jit::node_list &barriers);
std::vector concatGroupedInputs(torch::jit::Graph *graph,
                                GroupedInputArgs &grouped_inputs,
                                bool with_update);
torch::jit::Node *
createGroupedScatterReduceNode(torch::jit::Graph *graph,
                               const torch::jit::node_list &scatter_nodes,
                               const std::vector &inputs);
torch::jit::Node *
createGroupedGatherNode(torch::jit::Graph *graph,
                        const torch::jit::node_list &gather_nodes,
                        const std::vector &inputs);
torch::jit::node_list dispatch(torch::jit::Graph *graph,
                               torch::jit::node_list &nodes,
                               const GroupedOpFactory &createGroupedOpFn,
                               bool with_update = false);
InputArgs getInputArgs(const torch::jit::Node *node, bool with_update);
GroupedInputArgs groupInputs(const torch::jit::node_list &nodes,
                             bool with_update);
torch::jit::Node *mergeNodes(torch::jit::Graph *graph,
                             const torch::jit::node_list &nodes,
                             const GroupedOpFactory &createGroupedOpFn,
                             bool with_update);
void moveOutputNodesAfterInsertionPoint(const torch::jit::node_list &nodes,
                                        torch::jit::Node *insertion_point_node);
torch::jit::node_list removeDuplicates(const torch::jit::node_list &nodes,
                                       bool with_update);
void sortInTopologicalOrder(torch::jit::node_list &nodes);
void unpackGroupedOutputs(torch::jit::Graph *graph,
                          torch::jit::Node *grouped_node,
                          const torch::jit::node_list &fused_nodes);
} // namespace

/*
 * Algorithm:
 * 1. Move the BFS around the graph and add only those that all inputs are
 * encountered until the entire queue is scatters and gathers.
 * 2. Merge the scatters and gathers.
 * 3. Add outputs to queue and remove scatters and gathers.
 * 4. If queue is not empty go to step 1.
 *
 * The scatterreduce nodes and the gather nodes are handled by two separate
 * passes over the graph, run one after the other.
 */
void groupScatterReduceAndGatherNodes(torch::jit::Graph *graph) {
  groupScatterReduceNodes(graph);
  groupGatherNodes(graph);
}

// For every aten scatter_add / scatter_reduce node (and their in-place
// variants) whose index input is produced by an aten::expand or
// aten::expand_as, uses the non-expanded index directly and sets the
// "enable_index_broadcast" attribute to 1 on the scatter node instead. The
// expansion nodes which were bypassed are destroyed at the end.
void removeScatterAddIndexExpansion(torch::jit::Graph *graph) {
  const logging::LogContext ctx{"ScatterAddOptimization"};
  std::vector to_delete;
  for (auto *node : graph->nodes()) {
    if (node->kind() != c10::aten::scatter_add &&
        node->kind() != c10::aten::scatter_add_ &&
        node->kind() != c10::aten::scatter_reduce &&
        node->kind() != c10::aten::scatter_reduce_) {
      continue;
    }
    auto *index = node->input(2);
    auto *index_producer = index->node();
    // Only remove index expansions.
    if (index_producer->kind() != c10::aten::expand &&
        index_producer->kind() != c10::aten::expand_as) {
      continue;
    }
    auto *src = node->input(3);
    auto *original_index = index_producer->input(0);
    const auto expanded_index_shape = shapeFromTensor(index);
    // Make sure removal is valid: the expanded index must only be used by
    // this node and must have the same shape as the source.
    if (index->uses().size() > 1 ||
        shapeFromTensor(src) != expanded_index_shape) {
      continue;
    }
    logging::trace("Removing index expansion node: {}",
                   nodeToString(index_producer));
    node->replaceInputWith(index, original_index);
    node->i_(c10::Symbol::attr("enable_index_broadcast"), 1);
    to_delete.push_back(index_producer);
  }
  // Destruction is deferred so that the node list is not modified while it is
  // being iterated over.
  for (auto *node : to_delete) {
    node->destroy();
  }
}

namespace {
// Walks the graph breadth-first and merges the popart::scatterreduce nodes
// which become reachable together and share the same ScatterKind key.
void groupScatterReduceNodes(torch::jit::Graph *graph) {
  logging::LogContext const ctx{"groupScatterReduceNodes"};
  // Queue contains fully reached nodes.
  std::queue queue;
  torch::jit::node_list barriers;
  initQueue(graph, queue, barriers);
  // The unordered_map elements represent the number of times the node was
  // reached.
  std::unordered_map node_num_visited_inputs;
  // Position of the with_update flag inside the ScatterKind key (see the key
  // construction below).
  static constexpr auto with_update_idx = 3;
  using ScatterKind = std::tuple;
  // Pending scatterreduce nodes, bucketed by key.
  std::map scatters;
  // Number of scatterreduce nodes currently waiting in the queue.
  std::size_t optimization_candidates = 0;
  // Lambda to add the children of the vertex. A child is only queued once it
  // has been reached as many times as it has inputs.
  const auto add_children_to_queue = [&](const torch::jit::Node *node) {
    for (const torch::jit::Value *output : node->outputs()) {
      for (const torch::jit::Use &use : output->uses()) {
        torch::jit::Node *user = use.user;
        const auto num_user_inputs = user->inputs().size();
        auto &num_user_visited_inputs = node_num_visited_inputs[user];
        ++num_user_visited_inputs;
        if (num_user_visited_inputs == num_user_inputs) {
          queue.push(user);
          if (user->kind() == symbols::popart::scatterreduce) {
            ++optimization_candidates;
            const std::int64_t reduction =
                user->i(c10::Symbol::attr("reduction"));
            const at::ScalarType input_type = *user->input(0)
                                                   ->type()
                                                   ->expect()
                                                   ->scalarType();
            // A third input means the node also carries the tensor to update.
            const bool with_update = num_user_inputs == 3;
            const bool index_broadcast_enabled =
                user->i(c10::Symbol::attr("enable_index_broadcast")) != 0;
            const std::size_t stage = deduceOpStage(user, barriers);
            const std::int64_t axis = user->i(c10::Symbol::attr("axis"));
            const std::int64_t axis_size =
                user->i(c10::Symbol::attr("axis_size"));
            const ScatterKind key{
                reduction, input_type, index_broadcast_enabled, with_update,
                stage,     axis,       axis_size};
            scatters[key].push_back(user);
          }
        }
      }
    }
  };
  // Merges every bucket holding more than one node, then resumes the
  // traversal from the resulting (merged or untouched) nodes.
  const auto merge_scatters = [&]() {
    for (auto &&[scatter_kind, scatter_vec] : scatters) {
      if (scatter_vec.size() > 1) {
        const bool with_update = std::get(scatter_kind);
        const auto &merged_scatters = dispatch(
            graph, scatter_vec, createGroupedScatterReduceNode, with_update);
        for (torch::jit::Node *scatter_node : merged_scatters) {
          add_children_to_queue(scatter_node);
        }
      } else {
        add_children_to_queue(scatter_vec.front());
      }
    }
    scatters.clear();
  };
  while (!queue.empty()) {
    auto *node = queue.front();
    queue.pop();
    const torch::jit::Symbol kind = node->kind();
    // If scatterreduce, push back: its children are only visited once the
    // pending scatterreduce nodes have been merged.
    if (kind == symbols::popart::scatterreduce) {
      queue.push(node);
    } else {
      add_children_to_queue(node);
    }
    // If all elements of the queue are pending scatterreduce nodes.
    if (queue.size() == optimization_candidates) {
      // Clear queue.
      queue = std::queue();
      optimization_candidates = 0;
      // Merge the pending scatterreduce nodes and queue their children.
      merge_scatters();
    }
  }
}

// Walks the graph breadth-first and merges the popart::gather nodes which
// become reachable together and share the same GatherKind key.
void groupGatherNodes(torch::jit::Graph *graph) {
  logging::LogContext const ctx{"groupGatherNodes"};
  // Queue contains fully reached nodes.
  std::queue queue;
  torch::jit::node_list barriers;
  initQueue(graph, queue, barriers);
  // The unordered_map elements represent the number of times the node was
  // reached.
  std::unordered_map node_num_visited_inputs;
  using GatherKind = std::tuple;
  // Pending gather nodes, bucketed by key.
  std::map gathers;
  // Number of gather nodes currently waiting in the queue.
  std::size_t optimization_candidates = 0;
  // Lambda to add the children of the vertex. A child is only queued once it
  // has been reached as many times as it has inputs.
  const auto add_children_to_queue = [&](const torch::jit::Node *node) {
    for (const torch::jit::Value *output : node->outputs()) {
      for (const torch::jit::Use &use : output->uses()) {
        torch::jit::Node *user = use.user;
        const auto num_user_inputs = user->inputs().size();
        auto &num_user_visited_inputs = node_num_visited_inputs[user];
        ++num_user_visited_inputs;
        if (num_user_visited_inputs == num_user_inputs) {
          queue.push(user);
          if (user->kind() == symbols::popart::gather) {
            ++optimization_candidates;
            const at::ScalarType input_type = *user->input(0)
                                                   ->type()
                                                   ->expect()
                                                   ->scalarType();
            const std::int64_t axis = user->i(c10::Symbol::attr("axis"));
            const std::size_t stage = deduceOpStage(user, barriers);
            const GatherKind key{input_type, axis, stage};
            gathers[key].push_back(user);
          }
        }
      }
    }
  };
  // Merges every bucket holding more than one node, then resumes the
  // traversal from the resulting (merged or untouched) nodes.
  const auto merge_gathers = [&]() {
    for (auto &&[_, gather_vec] : gathers) {
      UNUSED(_);
      if (gather_vec.size() > 1) {
        const auto &merged_gathers =
            dispatch(graph, gather_vec, createGroupedGatherNode);
        for (torch::jit::Node *gather_node : merged_gathers) {
          add_children_to_queue(gather_node);
        }
      } else {
        add_children_to_queue(gather_vec.front());
      }
    }
    gathers.clear();
  };
  while (!queue.empty()) {
    auto *node = queue.front();
    queue.pop();
    const torch::jit::Symbol kind = node->kind();
    // If gather, push back: its children are only visited once the pending
    // gather nodes have been merged.
    if (kind == symbols::popart::gather) {
      queue.push(node);
    } else {
      add_children_to_queue(node);
    }
    // If all elements of the queue are pending gather nodes.
    if (queue.size() == optimization_candidates) {
      // Clear queue.
      queue = std::queue();
      optimization_candidates = 0;
      // Merge the pending gather nodes and queue their children.
      merge_gathers();
    }
  }
}

// Seeds `queue` with the traversal roots (every node without inputs, plus
// the node producing the graph inputs) and appends every
// poptorch::begin_ipu_block node to `barriers`, in graph order.
void initQueue(torch::jit::Graph *graph, std::queue &queue,
               torch::jit::node_list &barriers) {
  // Add roots to queue. `added` prevents a node from being queued twice.
  std::unordered_set added;
  for (torch::jit::Node *node : graph->nodes()) {
    if (node->inputs().empty()) {
      if (added.find(node) == added.end()) {
        queue.push(node);
        added.insert(node);
      }
    }
    if (node->kind() == symbols::poptorch::begin_ipu_block) {
      barriers.push_back(node);
    }
  }
  for (torch::jit::Value *input : graph->inputs()) {
    auto *node = input->node();
    if (added.find(node) == added.end()) {
      queue.push(node);
      added.insert(node);
    }
  }
}

// Find which phase the fused operation is in: returns the index of the first
// barrier `node` is before, or barriers.size() if there is none.
std::size_t deduceOpStage(const torch::jit::Node *node,
                          const torch::jit::node_list &barriers) {
  std::size_t stage = 0;
  while (stage < barriers.size() && !node->isBefore(barriers[stage])) {
    stage++;
  }
  return stage;
}

// Splits `nodes` into buckets sharing the same axis and input shapes, and
// merges every bucket holding more than one node. Returns one node per
// bucket: the merged node, or the single node of the bucket left untouched.
torch::jit::node_list dispatch(torch::jit::Graph *graph,
                               torch::jit::node_list &nodes,
                               const GroupedOpFactory &createGroupedOpFn,
                               bool with_update) {
  using Shape = std::vector;
  using Group = std::tuple;
  std::map group_to_merge_candidates;
  for (torch::jit::Node *node : nodes) {
    const std::int64_t axis = node->i(c10::Symbol::attr("axis"));
    const Shape src_shape = shapeFromTensor(node->input(0));
    const Shape index_shape = shapeFromTensor(node->input(1));
    // Without update there is no third input: an empty shape is used.
    const Shape self_shape =
        with_update ? shapeFromTensor(node->input(2)) : Shape{};
    const Group key{axis, index_shape, src_shape, self_shape};
    group_to_merge_candidates[key].push_back(node);
  }
  torch::jit::node_list grouped_nodes;
  for (auto &&[_, merge_candidates] : group_to_merge_candidates) {
    UNUSED(_);
    if (merge_candidates.size() > 1) {
      grouped_nodes.push_back(
          mergeNodes(graph, merge_candidates, createGroupedOpFn, with_update));
    } else {
      grouped_nodes.push_back(merge_candidates.front());
    }
  }
  return grouped_nodes;
}

// Replaces `nodes` by a single grouped node built by `createGroupedOpFn` and
// returns it. The grouped node is inserted before the last of the
// (deduplicated) nodes in graph order.
torch::jit::Node *mergeNodes(torch::jit::Graph *graph,
                             const torch::jit::node_list &nodes,
                             const GroupedOpFactory &createGroupedOpFn,
                             bool with_update) {
  torch::jit::node_list unique_nodes =
      poptorch::removeDuplicates(nodes, with_update);
  sortInTopologicalOrder(unique_nodes);
  torch::jit::Node *insertion_point_node = unique_nodes.back();
  // Users of the earlier nodes located before the insertion point would
  // otherwise end up before the grouped node which produces their input.
  moveOutputNodesAfterInsertionPoint(unique_nodes, insertion_point_node);
  auto grouped_inputs = groupInputs(unique_nodes, with_update);
  const WithNodeMetadata meta{insertion_point_node};
  const torch::jit::WithInsertPoint insertion_point(insertion_point_node);
  const auto grouped_args =
      concatGroupedInputs(graph, grouped_inputs, with_update);
  torch::jit::Node *grouped_node;
  grouped_node = createGroupedOpFn(graph, unique_nodes, grouped_args);
  unpackGroupedOutputs(graph, grouped_node, unique_nodes);
  return grouped_node;
}

// Keeps one node per distinct set of inputs (as returned by getInputArgs).
// The uses of the output of every duplicate are redirected to the output of
// the node which was kept, and the duplicates are handed over to
// searchAndPossiblyDestroy(). The returned nodes follow the ordering of the
// map keys, not the graph order.
torch::jit::node_list removeDuplicates(const torch::jit::node_list &nodes,
                                       bool with_update) {
  std::map input_args_to_nodes;
  std::unordered_set to_destroy;
  for (torch::jit::Node *node : nodes) {
    const auto node_inputs = getInputArgs(node, with_update);
    auto stored_node_it = input_args_to_nodes.find(node_inputs);
    const bool is_duplicate = stored_node_it != input_args_to_nodes.end();
    if (is_duplicate) {
      auto *const stored_node = stored_node_it->second;
      replaceOutputUse(node->output(), stored_node->output());
      to_destroy.insert(node);
    } else {
      input_args_to_nodes.emplace(node_inputs, node);
    }
  }
  searchAndPossiblyDestroy(to_destroy);
  torch::jit::node_list unique_nodes(input_args_to_nodes.size(), nullptr);
  std::transform(input_args_to_nodes.begin(), input_args_to_nodes.end(),
                 unique_nodes.begin(), [&](const auto &input_args_to_node) {
                   return input_args_to_node.second;
                 });
  return unique_nodes;
}

// Sorts `nodes` in place so that a node always comes after the nodes it
// isBefore() says precede it.
void sortInTopologicalOrder(torch::jit::node_list &nodes) {
  std::sort(nodes.begin(), nodes.end(),
            [=](const torch::jit::Node *lhs, const torch::jit::Node *rhs) {
              return lhs->isBefore(rhs);
            });
}

// Moves the transitive users of `nodes` (other than those of
// `insertion_point_node`) which are located before `insertion_point_node` to
// right after it, preserving their relative order.
void moveOutputNodesAfterInsertionPoint(
    const torch::jit::node_list &nodes,
    torch::jit::Node *insertion_point_node) {
  // Queues every user of every output of node_to_process.
  const auto collect_output_nodes = [](const torch::jit::Node *node_to_process,
                                       std::queue &queue) {
    for (const auto *output : node_to_process->outputs()) {
      for (const auto &use : output->uses()) {
        const auto &user = use.user;
        queue.push(user);
      }
    }
  };
  std::unordered_set collected_nodes_to_move;
  for (torch::jit::Node *node : nodes) {
    if (node == insertion_point_node) {
      continue;
    }
    std::queue nodes_to_move;
    collect_output_nodes(node, nodes_to_move);
    while (!nodes_to_move.empty()) {
      torch::jit::Node *node_to_move = nodes_to_move.front();
      nodes_to_move.pop();
      // Users which are already after the insertion point stay where they
      // are, and their own users are not followed.
      if (node_to_move->isBefore(insertion_point_node) &&
          collected_nodes_to_move.find(node_to_move) ==
              collected_nodes_to_move.end()) {
        collected_nodes_to_move.insert(node_to_move);
        collect_output_nodes(node_to_move, nodes_to_move);
      }
    }
  }
  torch::jit::node_list sorted_collected_nodes_to_move;
  sorted_collected_nodes_to_move.insert(sorted_collected_nodes_to_move.end(),
                                        collected_nodes_to_move.begin(),
                                        collected_nodes_to_move.end());
  sortInTopologicalOrder(sorted_collected_nodes_to_move);
  // Chain the moves so that each node lands after the previously moved one.
  auto *tmp_insertion_point_node = insertion_point_node;
  for (auto *node_to_move : sorted_collected_nodes_to_move) {
    node_to_move->moveAfter(tmp_insertion_point_node);
    tmp_insertion_point_node = node_to_move;
  }
}

// Transposes the inputs of `nodes`: result[k][g] is the k-th input of the
// g-th node. result[2] holds nullptr entries when with_update is false.
GroupedInputArgs groupInputs(const torch::jit::node_list &nodes,
                             bool with_update) {
  const int64_t num_groups = nodes.size();
  GroupedInputArgs grouped_input_nodes;
  for (auto &input_vec : grouped_input_nodes) {
    input_vec = std::vector(num_groups, nullptr);
  }
  for (int64_t group_id = 0; group_id < num_groups; ++group_id) {
    std::tie(grouped_input_nodes[0][group_id],
             grouped_input_nodes[1][group_id],
             grouped_input_nodes[2][group_id]) =
        getInputArgs(nodes[group_id], with_update);
  }
  return grouped_input_nodes;
}

// Returns the first two inputs of `node`, followed by its third input when
// with_update is set and by nullptr otherwise.
InputArgs getInputArgs(const torch::jit::Node *node, bool with_update) {
  return {node->input(0), node->input(1),
          (with_update ? node->input(2) : nullptr)};
}

// Unsqueezes every grouped input on axis 0 (`grouped_inputs` is updated in
// place) and concatenates the entries of each input position along axis 0.
// Returns the two (three with with_update) concatenated values.
std::vector concatGroupedInputs(torch::jit::Graph *graph,
                                GroupedInputArgs &grouped_inputs,
                                bool with_update) {
  const std::size_t num_groups = grouped_inputs[0].size();
  for (std::size_t group_id = 0; group_id < num_groups; group_id++) {
    auto &src_input = grouped_inputs[0][group_id];
    src_input = createUnsqueeze(graph, {src_input}, {0})->output();
    auto &index_input = grouped_inputs[1][group_id];
    index_input = createUnsqueeze(graph, {index_input}, {0})->output();
    if (with_update) {
      auto &self_input = grouped_inputs[2][group_id];
      self_input = createUnsqueeze(graph, {self_input}, {0})->output();
    }
  }
  std::vector grouped_args;
  grouped_args.reserve(3);
  grouped_args.push_back(createConcat(graph, grouped_inputs[0], 0)->output());
  grouped_args.push_back(createConcat(graph, grouped_inputs[1], 0)->output());
  if (with_update) {
    grouped_args.push_back(createConcat(graph, grouped_inputs[2], 0)->output());
  }
  return grouped_args;
}

// Creates the grouped scatterreduce node for `scatter_nodes`. The attributes
// are read from the last node of the list; the axis is incremented by one
// because the grouped inputs have an extra leading dimension.
torch::jit::Node *
createGroupedScatterReduceNode(torch::jit::Graph *graph,
                               const torch::jit::node_list &scatter_nodes,
                               const std::vector &inputs) {
  const int64_t num_groups = scatter_nodes.size();
  auto *const node_with_attributes = scatter_nodes.back();
  const auto axis_size =
      node_with_attributes->i(c10::Symbol::attr("axis_size"));
  const auto old_axis = node_with_attributes->i(c10::Symbol::attr("axis"));
  const auto reduction =
      node_with_attributes->i(c10::Symbol::attr("reduction"));
  const bool enable_index_broadcast =
      node_with_attributes->i(c10::Symbol::attr("enable_index_broadcast")) !=
      0;
  return createGroupedscatterreduce(graph, inputs, axis_size, old_axis + 1,
                                    num_groups, enable_index_broadcast,
                                    reduction);
}

// Creates the grouped gather node for `gather_nodes`. The axis is read from
// the last node of the list and incremented by one because the grouped inputs
// have an extra leading dimension.
torch::jit::Node *
createGroupedGatherNode(torch::jit::Graph *graph,
                        const torch::jit::node_list &gather_nodes,
                        const std::vector &inputs) {
  const int64_t num_groups = gather_nodes.size();
  auto *const node_with_attributes = gather_nodes.back();
  const auto axis = node_with_attributes->i(c10::Symbol::attr("axis"));
  return createGroupedgather(graph, inputs, axis + 1, num_groups);
}

// For each fused node, creates a slice + squeeze of the output of
// `grouped_node` for its group, redirects the uses of the fused node's
// outputs to it, and hands the fused node over to
// searchAndPossiblyDestroy().
void unpackGroupedOutputs(torch::jit::Graph *graph,
                          torch::jit::Node *grouped_node,
                          const torch::jit::node_list &fused_nodes) {
  std::unordered_set to_destroy;
  const int64_t num_groups = fused_nodes.size();
  for (int64_t group_id = 0; group_id < num_groups; ++group_id) {
    torch::jit::Node *slice = createSlice(graph, {grouped_node->output()},
                                          {group_id + 1}, {group_id}, {0});
    torch::jit::Node *squeeze = createSqueeze(graph, {slice->output()}, {0});
    // Replace outputs with grouped version.
    torch::jit::Node *node_to_replace = fused_nodes[group_id];
    for (torch::jit::Value *output : node_to_replace->outputs()) {
      replaceOutputUse(output, squeeze->output());
    }
    to_destroy.insert(node_to_replace);
  }
  // Destroy the nodes which were merged.
  searchAndPossiblyDestroy(to_destroy);
}
} // namespace
} // namespace poptorch

================================================
FILE: poptorch/source/GatherWithExpandedIndicesOptimization.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include #include #include "popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/OpBuilder.hpp" namespace poptorch { void simplifyGatherWithExpandedIndices(torch::jit::Graph *graph) { logging::LogContext const ctx{"GatherWithExpandedIndicesOptimisation"}; std::unordered_set to_delete; for (auto *node : graph->nodes()) { if (node->kind() != c10::aten::gather) { continue; } // aten::gather(Tensor self, int dim, Tensor index, *, bool // sparse_grad=False) -> Tensor auto *input = node->input(0); const size_t gather_dim = handleDimensionParam( node->input(1), input->type()->expect()); auto *indices = node->input(2); auto *expand_node = indices->node(); // Only remove index expansions. if (expand_node->kind() != c10::aten::expand && expand_node->kind() != c10::aten::expand_as) { continue; } const WithNodeMetadata meta(node); // aten::expand(Tensor self, int[] size, *, bool implicit) -> Tensor // aten::expand_as(Tensor self, Tensor other) -> Tensor auto *original_indices = expand_node->input(0); auto original_indices_shape = shapeFromTensor(original_indices); // Get the (single) expanded dimension std::vector expand_shape{}; if (expand_node->kind() == c10::aten::expand) { expand_shape = constantToLongVec(expand_node->input(1)->node()); } else { expand_shape = shapeFromTensor(expand_node->input(1)); } std::vector expand_dims{}; for (size_t i = 0; i < expand_shape.size(); i++) { if (expand_shape[i] > original_indices_shape[i]) { expand_dims.push_back(i); } } if (expand_dims.size() != 1) { continue; } const size_t expand_dim = expand_dims[0]; // Only optimise if: // * source tensor's shape has 2 dimensions of length > 1 // * dimension of gather, and dimension of expand are the 2 dimensions of // length > 1 const auto self_shape = shapeFromTensor(input); std::vector non_singleton_dimensions{}; for (size_t i = 0; i < self_shape.size(); 
i++) { if (self_shape[i] > 1) { non_singleton_dimensions.push_back(i); } } if (non_singleton_dimensions.size() != 2) { continue; } const auto ga_it = std::find(non_singleton_dimensions.begin(), non_singleton_dimensions.end(), gather_dim); const auto ex_it = std::find(non_singleton_dimensions.begin(), non_singleton_dimensions.end(), expand_dim); if (ga_it == ex_it || ga_it == non_singleton_dimensions.end() || ex_it == non_singleton_dimensions.end()) { continue; } // Replace the aten::expand -> aten::gather with an // aten::squeeze -> aten::index_select logging::debug("Optimising gather: {}", nodeToString(node)); std::vector squeezed_shape; std::copy_if(original_indices_shape.begin(), original_indices_shape.end(), std::back_inserter(squeezed_shape), [](auto dim) { return dim > 1; }); torch::jit::WithInsertPoint const insert_point(node); torch::jit::Node *squeezed = createAndInsertNode(graph, c10::aten::squeeze, {original_indices}, ImplicitCast::None, OutputType::AsFirstInput); squeezed->output()->setType( original_indices->type()->expect()->withSizes( squeezed_shape)); torch::jit::Node *gathered = createAndInsertNode(graph, c10::aten::index_select, {input, node->input(1), squeezed->output()}, ImplicitCast::None, OutputType::AsFirstInput) ->output() ->node(); to_delete.insert(node); to_delete.insert(expand_node); if (node->hasUses()) { for (size_t i = 0; i < node->outputs().size(); ++i) { // As well as replacing the use, this will copy across shape/type // if not explicitly set. replaceOutputUse(node, gathered, i); } } } // Remove the dead nodes. searchAndPossiblyDestroy(to_delete); } } // namespace poptorch ================================================ FILE: poptorch/source/ImplicitCasting.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include
#include
#include
#include

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/ImplicitCasting.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

#include "PoptorchSymbols.hpp"

namespace poptorch {

namespace {

// Return true if the input at position `input_num` is excluded from implicit
// casting by the given mode (e.g. ExceptFirst excludes input 0,
// ExceptFourthFifth excludes inputs 3 and 4).
// Raises an error if called with ImplicitCast::None.
bool skipInput(const ImplicitCast implicit_cast,
               const unsigned int input_num) {
  ERROR_ON(implicit_cast == ImplicitCast::None);

  if (implicit_cast == ImplicitCast::ExceptFirst && input_num == 0) {
    return true;
  }
  if (implicit_cast == ImplicitCast::ExceptSecond && input_num == 1) {
    return true;
  }
  if (implicit_cast == ImplicitCast::ExceptThird && input_num == 2) {
    return true;
  }
  if (implicit_cast == ImplicitCast::ExceptFourthFifth &&
      (input_num == 3 || input_num == 4)) {
    return true;
  }

  return false;
}

// Compute the scalar type the participating inputs should be promoted to.
// Every input that is neither skipped by `implicit_cast` nor of None type is
// fed to at::native::update_result_type_state as an empty CPU tensor carrying
// the input's scalar type and its concrete sizes (an empty size list when the
// sizes are not concrete); the accumulated state is then resolved with
// at::native::result_type.
// Raises an error if a participating input has no scalar type.
c10::ScalarType inferExpectedTypeDispatch(
    const torch::jit::ArrayRef &inputs,
    const ImplicitCast implicit_cast) {
  // Work out the types of all inputs
  at::native::ResultTypeState state = {};
  unsigned int input_num = 0;
  for (auto *input : inputs) {
    logging::LogContext const ctx(std::string("processing input ") +
                                  std::to_string(input_num));
    if (!skipInput(implicit_cast, input_num) &&
        input->type()->kind() != c10::TypeKind::NoneType) {
      auto tensor_type = input->type()->expect();
      ERROR_ON(!tensor_type->scalarType());
      auto osizes = tensor_type->sizes().concrete_sizes();
      // Left empty when the JIT type has no concrete sizes.
      std::vector sizes;
      if (osizes) {
        sizes = *osizes;
      }
      state = at::native::update_result_type_state(
          at::native::empty_cpu(c10::IntArrayRef(sizes.data(), sizes.size()),
                                tensor_type->scalarType()),
          state);
    }
    input_num++;
  }

  return at::native::result_type(state);
}

// Return true if `input` has a scalar type different from `expected_type`.
// None-typed inputs never need retyping. Raises an error if `input` is
// produced by a prim::Constant node.
bool needToRetype(const torch::jit::Value *input,
                  const c10::ScalarType expected_type) {
  if (input->type()->kind() == c10::TypeKind::NoneType) {
    return false;
  }

  ERROR_ON(input->node()->kind() == at::prim::Constant);

  auto input_type = input->type()->cast()->scalarType();
  return input_type != expected_type;
}

// Create a cast of `input` to `type` (via createCast), give its output the
// type of `input` with the scalar type replaced, and return the cast's output.
// NOTE(review): `node` is the node that *produces* `input`, and
// replaceInputWith() is invoked on that producer — confirm this call has the
// intended effect.
torch::jit::Value *addCast(torch::jit::Value *input,
                           const c10::ScalarType type) {
  torch::jit::Node *node = input->node();
  auto *new_node = createCast(input->owningGraph(), input, type);

  auto current_type = input->type()->cast();
  new_node->output()->setType(current_type->withScalarType(type));

  node->replaceInputWith(input, new_node->output());

  return new_node->output();
}

} // namespace

// Return a copy of `*inputs` in which every participating input whose scalar
// type differs from the promoted type has been replaced by a cast to that
// type. Inputs skipped by `implicit_cast`, None inputs and inputs already of
// the expected type are passed through unchanged.
std::vector
implicitCastInputs(torch::jit::ArrayRef *inputs,
                   const ImplicitCast implicit_cast) {
  // The dispatcher version of mixed-precision type inference simply delegates
  // to PyTorch's own routines, so that we always match their decisions.
  c10::ScalarType const expected_type =
      inferExpectedTypeDispatch(*inputs, implicit_cast);

  std::vector new_inputs;

  unsigned int input_num = 0;
  for (auto *input : *inputs) {
    if (!skipInput(implicit_cast, input_num) &&
        needToRetype(input, expected_type)) {
      new_inputs.push_back(addCast(input, expected_type));
    } else {
      new_inputs.push_back(input);
    }
    input_num++;
  }

  return new_inputs;
}

// Destroy every popart::cast node in `graph` whose output has no uses,
// together with its poptorch::tensor_constant input when the cast was that
// constant's only use.
void removeDeadImplicitCasts(torch::jit::Graph *graph) {
  // We are removing dead code casts that result from the following cases:
  // - Torch is dispatching a cast of a tensor in which case it should be used
  //   elsewhere and its uses won't be empty -> just delete the cast.
  // - Torch is dispatching a cast of a wrapped number (a tensor_constant on
  //   our side) -> delete the cast and the constant.
  //
  // Nodes are collected first and destroyed after the traversal; each cast is
  // queued before its constant, so the user is destroyed before the producer.
  std::vector to_delete;
  for (auto *node : graph->nodes()) {
    if (node->kind() != symbols::popart::cast || node->hasUses()) {
      continue;
    }
    to_delete.push_back(node);
    if (node->input()->uses().size() == 1) {
      // 'node' is the only use so it's safe to delete. This must be a
      // tensor_constant representing a wrapped number.
      auto *constant = node->input()->node();
      if (constant->kind() == symbols::poptorch::tensor_constant) {
        to_delete.push_back(constant);
      }
    }
  }
  for (auto *node : to_delete) {
    node->destroy();
  }
}

} // namespace poptorch

================================================
FILE: poptorch/source/InplaceOps.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include
#include
#include

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/InplaceOpsPyTorch.hpp_nolint"
#include "poptorch/OpBuilder.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/Utils.hpp"

#include "PoptorchSymbols.hpp"

namespace poptorch {
namespace {

namespace aten = c10::aten;

// Ops which only have an in-place version
// (these kinds are returned unchanged by outplaceKind() and are left in place
// by removeRemainingInplaceOps()).
const std::unordered_set &onlyInplaceOps() {
  // static to make sure values are initialised
  static const std::unordered_set only_inplace = {
      aten::copy_, aten::normal_, aten::uniform_, aten::random_,
      aten::exponential_};
  return only_inplace;
}

// Known view operations
// (used when walking an alias back through views to the tensor it views).
const std::unordered_set &viewOps() {
  // static to make sure values are initialised
  static const std::unordered_set view_ops = {
      aten::chunk,    aten::detach,     aten::narrow,   aten::permute,
      aten::reshape,  aten::select,     aten::slice,    aten::split,
      aten::squeeze,  aten::transpose,  aten::unbind,   aten::unsqueeze,
      aten::view,     aten::as_strided, aten::diagonal, aten::movedim,
      aten::swapaxes, aten::swapdims,   aten::view_as,  aten::_unsafe_view};
  return view_ops;
}

// Count the tensors returned by `graph`. An output produced by a
// prim::ListConstruct node contributes the tensor count of each of that
// node's inputs; any other output contributes the tensor count of its own
// type (both via numTensorsForType).
size_t countNumTensorOutputs(torch::jit::Graph &graph) {
  size_t num_tensors = 0;

  for (const auto &output : graph.outputs()) {
    if (output->node()->kind() == c10::prim::ListConstruct) {
      for (const auto &input : output->node()->inputs()) {
        num_tensors += numTensorsForType(input->type());
      }
    } else {
      num_tensors += numTensorsForType(output->type());
    }
  }
  return num_tensors;
}

//
// When replacing `node` with `new_node`, if `new_node` doesn't have enough
// inputs pad them out with None-nodes.
// NOTE: Body mostly taken from torch (see torch::jit::RemoveInplaceOps), with
// the addition of metadata.
//
// The number of inputs to add is looked up in torch::jit::expectedInputCount
// using the kind of `node`; if that kind has no entry nothing is added.
void addAdditionalInputsIfRequired(torch::jit::Graph *graph,
                                   const torch::jit::Node *node,
                                   torch::jit::Node *new_node) {
  int additional_input_count = 0;
  if (torch::jit::expectedInputCount.find(node->kind()) !=
      torch::jit::expectedInputCount.end()) {
    additional_input_count = torch::jit::expectedInputCount.at(node->kind()) -
                             static_cast(new_node->inputs().size());
  }

  const WithNodeMetadata meta(new_node);
  // A zero or negative count leaves new_node untouched.
  for (int i = 0; i < additional_input_count; ++i) {
    auto *none_node = graph->createNone();
    // NOLINTNEXTLINE readability-suspicious-call-argument
    insertNodeBeforeNode(none_node, new_node);
    new_node->addInput(none_node->output());
  }
}

// Insert, at the position of `node`, a node of kind outplaceKind(node->kind())
// taking the same inputs (padded with None if required), give its output the
// type of `node`'s output and redirect all uses of `node`'s output to it.
// `node` itself is not destroyed here. Returns the new node.
torch::jit::Node *outplaceOp(torch::jit::Graph &graph,
                             torch::jit::Node *node) {
  torch::jit::NodeKind const new_kind = outplaceKind(node->kind());

  torch::jit::WithInsertPoint const insert_point(node);
  const WithNodeMetadata meta(node);
  auto *new_node = createAndInsertNode(&graph, new_kind, node->inputs());

  addAdditionalInputsIfRequired(&graph, node, new_node);

  new_node->output()->setType(node->output()->type());
  node->output()->replaceAllUsesWith(new_node->output());

  return new_node;
}

// Replace every in-place op in `graph` (as reported by torch::jit::isInplaceOp)
// by its out-of-place equivalent, except for the kinds listed in
// onlyInplaceOps(). The replaced nodes are destroyed after the traversal.
void removeRemainingInplaceOps(torch::jit::Graph &graph) {
  std::vector to_delete;
  for (auto *node : graph.nodes()) {
    // Skip if not in-place
    if (!torch::jit::isInplaceOp(node)) {
      continue;
    }

    // Keep it in place if there is only an inplace version
    if (onlyInplaceOps().count(node->kind()) != 0) {
      continue;
    }

    outplaceOp(graph, node);
    to_delete.push_back(node);
  }
  for (auto *node : to_delete) {
    node->destroy();
  }
}

} // namespace

// Map an in-place node kind to its out-of-place kind:
//   * kinds in onlyInplaceOps() are returned unchanged;
//   * kinds present in torch::jit::inPlaceToOutOfPlace use that mapping;
//   * otherwise a trailing '_' in the qualified name is dropped;
//   * any other kind is returned unchanged.
torch::jit::NodeKind outplaceKind(torch::jit::NodeKind kind) {
  if (onlyInplaceOps().count(kind) != 0) {
    return kind;
  }

  std::string kind_str = kind.toQualString();

  torch::jit::NodeKind new_kind = kind;
  if (torch::jit::inPlaceToOutOfPlace.count(kind) != 0) {
    new_kind = torch::jit::inPlaceToOutOfPlace.at(kind);
  } else if (kind_str.back() == '_') {
    // Remove trailing '_' from the kind string
    kind_str.pop_back();
    new_kind = c10::Symbol::fromQualString(kind_str);
  }
  return new_kind;
}

// Start tracking `input`, initially registered as an alias of itself.
// Raises an error if the value is already tracked.
void InplaceInputsTracker::addTensor(torch::jit::Value *input) {
  logging::trace("Tracking tensor %{}", input->debugName());
  const bool success = _aliases.insert({input, input}).second;
  ERROR_ON_MSG(!success, "Value already tracked");
}

// Follow `alias` back through any view ops (taking input 0 of each), and if
// the value reached is a tracked alias remove its entry and return the input
// it was aliasing. Returns nullptr if the value reached is not tracked.
// Raises an error if `alias` is null.
torch::jit::Value *
InplaceInputsTracker::eraseCurrentAlias(torch::jit::Value *alias) {
  ERROR_ON(alias == nullptr);
  // Walk through the view ops until we find an input tensor.
  while (viewOps().count(alias->node()->kind()) != 0) {
    alias = alias->node()->input(0);
  }

  auto it = _aliases.find(alias);
  if (it != _aliases.end()) {
    // Read the mapped input before erasing: `it` is invalid afterwards.
    auto *real_input = it->second;
    logging::trace("Deleted alias %{} for input %{}", it->first->debugName(),
                   it->second->debugName());
    // Remove current alias.
    _aliases.erase(it);
    return real_input;
  }
  return nullptr;
}

// Record `alias` as the current alias of `aliased_input`.
// Raises an error if `alias` is already present in the map.
void InplaceInputsTracker::registerAlias(torch::jit::Value *aliased_input,
                                         torch::jit::Value *alias) {
  logging::trace("Registering alias %{} for input %{}", alias->debugName(),
                 aliased_input->debugName());
  ERROR_ON(!_aliases.insert({alias, aliased_input}).second);
}

// Finish in-place handling for `graph` and describe the result:
//   1. tag the node producing each real alias with a "was_inplace_on_view"
//      attribute;
//   2. for every collapsed graph input that has an alias, either insert a
//      poptorch::update_param_inplace node (parameters) or make sure the
//      alias is a graph output and record which output it is (true inputs);
//   3. outplace all remaining in-place ops.
// `num_anchors` is added to both output counts of the returned info.
// Raises an error if two aliases map to the same input, if a collapsed graph
// input is not tracked, or if a parameter is aliased while
// `replicas_needing_broadcast` is set.
InplaceGraphInfo
InplaceInputsTracker::finalizeGraph(torch::jit::Graph &graph,
                                    size_t num_anchors,
                                    bool replicas_needing_broadcast) {
  // For every alias (ie. target of an inplace op), look back and see if it's
  // applied through a bunch of views back to an input. if it is, mark it to be
  // handled later, at canonicalisation.
  for (const auto &[alias, aliased_input] : _aliases) {
    // Self-mapped entries are inputs that were never modified in place.
    if (alias == aliased_input) {
      continue;
    }

    auto *inplace_op = alias->node();
    // Aliases are already traced back through views to graph inputs when
    // they're updated via `eraseCurrentAlias`, so can just check that the
    // ultimate input (`aliased_input`) is different to the inplace op's
    // immediate input.
    const bool was_inplace_on_view =
        !inplace_op->inputs().empty() && aliased_input != inplace_op->input(0);
    inplace_op->i_(c10::Symbol::attr("was_inplace_on_view"),
                   was_inplace_on_view ? 1 : 0);
  }

  // _aliases[alias] = graph_input -> we want the other way around.
  std::map input_aliases;
  for (auto &p : _aliases) {
    ERROR_ON_MSG(!input_aliases.insert({p.second, p.first}).second,
                 "More than one alias for graph input %"
                     << p.second->debugName());
  }

  const size_t num_normal_tensor_outputs = countNumTensorOutputs(graph);

  InplaceGraphInfo out;
  // Output counts are taken before any alias output is registered below.
  out.num_normal_outputs = graph.outputs().size() + num_anchors;
  out.num_tensor_outputs = num_normal_tensor_outputs + num_anchors;

  const std::vector collapsed_inputs =
      collapsedGraphInputHierachy(&graph);
  out.input_output_mapping.reserve(collapsed_inputs.size());

  for (const auto &graph_input : collapsed_inputs) {
    auto it = input_aliases.find(graph_input);
    ERROR_ON(it == input_aliases.end());

    size_t output_mapping = InplaceGraphInfo::no_mapping;
    if (it->first == it->second) {
      // no alias found
    } else {
      auto *alias = it->second;
      if (isParameter(graph_input)) {
        logging::trace("Alias for parameter %{} -> %{}",
                       it->first->debugName(), alias->debugName());

        // This is not supported with replicas needing broadcast
        ERROR_ON_MSG(
            replicas_needing_broadcast,
            "PopTorch does not support broadcasting buffers. If your "
            "model is able to tolerate buffers becoming out of sync "
            "between replicas, you can disable buffer broadcasting using "
            "poptorch.Options.broadcastBuffers(False).");

        const WithNodeMetadata meta(alias->node());
        auto *new_node =
            createAndInsertNode(&graph, symbols::poptorch::update_param_inplace,
                                {graph_input, alias});
        // Place the update directly after the node that produces the alias.
        new_node->moveAfter(alias->node());
        new_node->output()->setType(alias->type());
      } else {
        logging::trace("Alias for input %{} -> %{}", it->first->debugName(),
                       alias->debugName());

        // Check if the alias is already being returned.
        // (If it is returned more than once, the last matching index is kept.)
        for (size_t output = 0; output < graph.outputs().size(); output++) {
          if (graph.outputs()[output] == alias) {
            output_mapping = output;
          }
        }
        // If not, add a new output.
        if (output_mapping == InplaceGraphInfo::no_mapping) {
          output_mapping = graph.registerOutput(alias);

          // Ensure the overlap flag is set to no overlap (any models wanting
          // the additional efficiency of overlapped host IO should not use
          // inplace ops.)
          auto overlap_symbol =
              getOverlapSymbol("output", graph.outputs().size() - 1);
          graph.return_node()->s_(overlap_symbol, "no_overlap");
        }
      }
    }

    // The input/output mapping is only for 'true' inputs -- not parameters &
    // buffers (see its usage in PoplarExecutable::run).
    if (!isParameter(graph_input)) {
      out.input_output_mapping.push_back(output_mapping);
    }
  }

  // Outplace all the ops we can; the _aliases map no longer needs to be kept
  // up-to-date.
  removeRemainingInplaceOps(graph);

  return out;
}

// For every poptorch::start_for_loop / poptorch::end_for_loop pair, replace
// input 1 of the end node with the input of the matching start node.
// Raises an error if a start is met while a loop is already open, or an end
// is met with no open loop.
void fixForLoopInputs(torch::jit::Graph &graph) {
  // Input of the currently open start_for_loop, or nullptr when none is open.
  torch::jit::Value *correct_loop_input = nullptr;
  for (auto *node : graph.nodes()) {
    if (node->kind() == symbols::poptorch::start_for_loop) {
      ERROR_ON_MSG(correct_loop_input,
                   "[Internal] new poptorch::start_for_loop "
                   "encountered before previous poptorch::end_for_loop");
      correct_loop_input = node->input();
    } else if (node->kind() == symbols::poptorch::end_for_loop) {
      ERROR_ON_MSG(!correct_loop_input,
                   "[Internal] poptorch::end_for_loop "
                   "encountered before poptorch::start_for_loop");
      node->replaceInput(1, correct_loop_input);
      correct_loop_input = nullptr;
    }
  }
}

// Check, by counting the start_if_block / start_else_block / end_if_block
// nodes in graph order, that each else follows an unmatched if, each end
// follows an unmatched else, and that the three counts are equal once the
// whole graph has been visited.
void verifyIfElseBlocksOrder(const torch::jit::Graph &graph) {
  // Verifies order of if...else blocks and generates friendly user error
  // messages if the order is incorrect.

  size_t if_cnt = 0;
  size_t else_cnt = 0;
  size_t end_cnt = 0;

  for (const auto *node : graph.nodes()) {
    if (node->kind() == symbols::poptorch::start_if_block) {
      if_cnt++;
    } else if (node->kind() == symbols::poptorch::start_else_block) {
      ERROR_ON_MSG(if_cnt <= else_cnt,
                   "[Internal] new poptorch::start_else_block "
                   "encountered before previous poptorch::start_if_block");
      else_cnt++;
    } else if (node->kind() == symbols::poptorch::end_if_block) {
      ERROR_ON_MSG(if_cnt < else_cnt || else_cnt <= end_cnt,
                   "[Internal] poptorch::end_if_block "
                   "encountered before poptorch::start_if_block and "
                   "poptorch::start_else_block");
      end_cnt++;
    }
  }

  ERROR_ON_MSG(!(if_cnt == else_cnt && else_cnt == end_cnt),
               "[Internal] no enclosing poptorch::end_if_block encountered");
}

} // namespace poptorch

================================================
FILE: poptorch/source/LowerToPopart.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "poptorch/LowerToPopart.hpp"

#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "PoptorchSymbols.hpp"
#include "popart_compiler/Compiler.hpp"
#include "popart_compiler/PopartEnums.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/InplaceOps.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace fs = std::experimental::filesystem;

namespace poptorch {

namespace {

// Return the file the model proto should be exported to, or an empty string
// if the POPTORCH_EXPORT_PROTO_FILE environment variable is not set.
// A value with an extension is treated as a file name (its parent directory
// is created); a value without one is treated as a directory, which is
// created and "/model.proto" is appended to it.
std::string getModelProtoFilename() {
  if (const char *proto_file = std::getenv("POPTORCH_EXPORT_PROTO_FILE")) {
    fs::path file = fs::absolute(proto_file);
    fs::path dir = file;
    if (dir.has_extension()) {
      dir.remove_filename();
    } else {
      file += "/model.proto";
    }
    fs::create_directories(dir);
    logging::info(
        "POPTORCH_EXPORT_PROTO_FILE set: saving model prototype to {}", file);
    return file;
  }
  return "";
}

// Mapping between the SSA values of torch jit with the ssa values of popart.
// Each Value is either a single tensor, tuple or list (Note: nested tuples are
// stored flattened).
class ValueMap {
public:
  using TensorList = std::vector;

  // Return the single tensor id mapped to `value`; errors if the value is
  // missing or was not registered as a tensor.
  popart_compiler::TensorId tensor(torch::jit::Value *value) const;

  // Return the tensor ids mapped to `value`; errors if the value is missing
  // or was not registered as a tuple or list.
  const TensorList &listTuple(torch::jit::Value *value) const;

  // Return the list of tensors without checking if it's a tuple, list or a
  // single tensor.
  const TensorList &tensors(torch::jit::Value *value) const;

  // True if `value` has an entry in the map (whatever its element type).
  bool hasTensor(torch::jit::Value *value) const {
    return _map.count(value) == 1;
  }

  // Register `value`; each setter errors if the value is already present.
  void setTensor(torch::jit::Value *value, popart_compiler::TensorId id);
  void setList(torch::jit::Value *value, const TensorList &tensors);
  void setTuple(torch::jit::Value *value, const TensorList &tensors);

private:
  // Element kind (tensor / tuple / list) plus the tensor ids it holds.
  struct Data {
    explicit Data(popart_compiler::TensorId id)
        : type(popart_compiler::OutputElemType::Tensor) {
      tensors.push_back(id);
    }

    Data(TensorList tuple, popart_compiler::OutputElemType type_)
        : type(type_), tensors(std::move(tuple)) {}

    popart_compiler::OutputElemType type;

    TensorList tensors;
  };

  std::unordered_map _map;
};

// See the declaration in ValueMap.
popart_compiler::TensorId ValueMap::tensor(torch::jit::Value *value) const {
  const auto it = _map.find(value);
  ERROR_ON_MSG(it == _map.cend(),
               value->debugName() << " not found in ValueMap");
  ERROR_ON_MSG(it->second.type != popart_compiler::OutputElemType::Tensor,
               value->debugName() << " is not a tensor");
  ERROR_ON(it->second.tensors.size() != 1);
  return it->second.tensors.front();
}

// See the declaration in ValueMap.
const ValueMap::TensorList &
ValueMap::listTuple(torch::jit::Value *value) const {
  const auto it = _map.find(value);
  ERROR_ON_MSG(it == _map.end(),
               value->debugName() << " not found in ValueMap");
  ERROR_ON_MSG((it->second.type != popart_compiler::OutputElemType::Tuple &&
                it->second.type != popart_compiler::OutputElemType::List),
               value->debugName() << " is not a tuple or list");
  return it->second.tensors;
}

// See the declaration in ValueMap.
const ValueMap::TensorList &ValueMap::tensors(torch::jit::Value *value) const {
  const auto it = _map.find(value);
  ERROR_ON_MSG(it == _map.end(),
               value->debugName() << " not found in ValueMap");
  return it->second.tensors;
}

// Register `value` as a single tensor.
void ValueMap::setTensor(torch::jit::Value *value,
                         popart_compiler::TensorId id) {
  ERROR_ON_MSG(!_map.emplace(value, Data(id)).second,
               "Value " << value->debugName()
                        << " already present in the map");
}

// Register `value` as a list of tensors.
void ValueMap::setList(torch::jit::Value *value,
                       const ValueMap::TensorList &tensors) {
  ERROR_ON_MSG(
      !_map.emplace(value, Data(tensors, popart_compiler::OutputElemType::List))
           .second,
      "Value " << value->debugName() << " already present in the map");
}

// Register `value` as a tuple of tensors.
void ValueMap::setTuple(torch::jit::Value *value,
                        const ValueMap::TensorList &tensors) {
  ERROR_ON_MSG(
      !_map.emplace(value,
                    Data(tensors, popart_compiler::OutputElemType::Tuple))
           .second,
      "Value " << value->debugName() << " already present in the map");
}

/*
 * Static helper functions.
 */

// Return the PopART type name for a PyTorch scalar type. Double maps to
// "FLOAT" and Long maps to "INT32"; unhandled types are logged as an error
// and yield "UNIMPLEMENTED".
const char *typeToPopartCStr(const at::ScalarType type) {
  switch (type) {
  case at::ScalarType::Float:
  case at::ScalarType::Double:
    return "FLOAT";
  case at::ScalarType::Half:
    return "FLOAT16";
  case at::ScalarType::Short:
    return "INT16";
  case at::ScalarType::Int:
  case at::ScalarType::Long:
    return "INT32";
  case at::ScalarType::Bool:
    return "BOOL";
  case at::ScalarType::Char:
    return "INT8";
  case at::ScalarType::Byte:
    return "UINT8";
  default:
    logging::err("Unimplemented type '{}'", type);
    return "UNIMPLEMENTED";
  }
  // Not reachable: every switch path above returns.
  return "UNIMPLEMENTED";
}

// Copy the sizes of `tensor` into a vector.
std::vector getTensorDimensions(const at::Tensor &tensor) {
  const auto &sizes = tensor.sizes();
  return std::vector(sizes.cbegin(), sizes.cend());
}

// Convert a PopART data type to the PyTorch scalar type used for it.
// The unsigned 16/32-bit types map to the signed type of the same width
// (UINT16 -> Short, UINT32 -> Int). Float8 and any unlisted type raise an
// error.
at::ScalarType fromPopartType(const popart_compiler::PopartType type) {
  switch (type) {
  case popart_compiler::PopartType::UINT8: {
    return at::ScalarType::Byte;
  }
  case popart_compiler::PopartType::INT8: {
    return at::ScalarType::Char;
  }
  case popart_compiler::PopartType::INT16:
  case popart_compiler::PopartType::UINT16: {
    return at::ScalarType::Short;
  }
  case popart_compiler::PopartType::INT32:
  case popart_compiler::PopartType::UINT32: {
    return at::ScalarType::Int;
  }
  case popart_compiler::PopartType::INT64: {
    return at::ScalarType::Long;
  }
  case popart_compiler::PopartType::BOOL: {
    return at::ScalarType::Bool;
  }
  case popart_compiler::PopartType::FLOAT: {
    return at::ScalarType::Float;
  }
  case popart_compiler::PopartType::FLOAT16: {
    return at::ScalarType::Half;
  }
  case popart_compiler::PopartType::BFLOAT16: {
    return at::ScalarType::BFloat16;
  }
  case popart_compiler::PopartType::DOUBLE: {
    return at::ScalarType::Double;
  }
  case popart_compiler::PopartType::COMPLEX64: {
    return at::ScalarType::ComplexFloat;
  }
  case popart_compiler::PopartType::COMPLEX128: {
    return at::ScalarType::ComplexDouble;
  }
  case popart_compiler::PopartType::FLOAT8_143:
  case popart_compiler::PopartType::FLOAT8_152:
    ERROR("Can't convert a float8 PopART type to a PyTorch scalar type");
  default:
    ERROR("Unsupported PopART data type " << toPopartTypeStr(type));
  }
}

// Convert a PyTorch scalar type to the PopART data type of the same name and
// width (unlike typeToPopartCStr, Long -> INT64 and Double -> DOUBLE).
// Unlisted types raise an error.
popart_compiler::PopartType toPopartType(const at::ScalarType type) {
  switch (type) {
  case at::ScalarType::Byte: {
    return popart_compiler::PopartType::UINT8;
  }
  case at::ScalarType::Char: {
    return popart_compiler::PopartType::INT8;
  }
  case at::ScalarType::Short: {
    return popart_compiler::PopartType::INT16;
  }
  case at::ScalarType::Int: {
    return popart_compiler::PopartType::INT32;
  }
  case at::ScalarType::Long: {
    return popart_compiler::PopartType::INT64;
  }
  case at::ScalarType::Bool: {
    return popart_compiler::PopartType::BOOL;
  }
  case at::ScalarType::Float: {
    return popart_compiler::PopartType::FLOAT;
  }
  case at::ScalarType::Half: {
    return popart_compiler::PopartType::FLOAT16;
  }
  case at::ScalarType::BFloat16: {
    return popart_compiler::PopartType::BFLOAT16;
  }
  case at::ScalarType::Double: {
    return popart_compiler::PopartType::DOUBLE;
  }
  case at::ScalarType::ComplexFloat: {
    return popart_compiler::PopartType::COMPLEX64;
  }
  case at::ScalarType::ComplexDouble: {
    return popart_compiler::PopartType::COMPLEX128;
  }
  default:
    ERROR("Unsupported PyTorch scalar type " << toString(type));
  }
}

// Append the PopART type and the shape of the tensor-typed `value` to
// `types` and `shapes` respectively.
// NOTE(review): the scalar type, the rank and every dimension are
// dereferenced without a check — presumably the caller guarantees a fully
// specified tensor type; confirm.
void platformAgnosticTypeInfoFromIRType(
    torch::jit::Value *value, std::vector *types,
    std::vector> *shapes) {
  const std::shared_ptr tensor_type =
      value->type()->expect();
  c10::ScalarType const as_scalar = *tensor_type->scalarType();
  types->emplace_back(toPopartType(as_scalar));

  c10::VaryingShape const shape = tensor_type->sizes();
  shapes->emplace_back();
  for (std::uint32_t i = 0; i < *shape.size(); ++i) {
    shapes->back().push_back(*shape[i]);
  }
}

} // namespace

namespace detail {
/*
 * Implementation of the lowering operation.
 */
class LowerToPopartImpl {
public:
  // Defined outside this part of the file.
  LowerToPopartImpl(torch::jit::Graph *g, InplaceGraphInfo &&inplace_info,
                    bool training, std::vector &&opt,
                    const popart_compiler::SessionOptions &options,
                    const AttributeAccessor &attribute_accessor,
                    CPUCallbackMap &&callback, const AnchorList &&anchors);

  // Lower parameters, body and return of the graph; must be called before
  // compile().
  void lower();

  // Initialise the session, compile, and wrap the result in an executable.
  std::shared_ptr compile();
  // Same as compile(), but loads a previously saved executable instead of
  // compiling.
  std::shared_ptr
  loadExecutableFromFile(const std::string &input_filename);

private:
  void printWasLoweredDebug(const torch::jit::Node *node,
                            popart_compiler::TensorId first_output_tensor);

  torch::jit::Graph &_graph;

  // Set by lower(); checked by compile().
  bool _lowered;

  std::vector _parameters;
  std::vector _parameter_names;
  InplaceGraphInfo _inplace_info;

  std::vector _input_tensor_hooks;

  std::vector _output_tensor_hooks;

  ValueMap _value_map;

  // Optimizer from the user.
  const std::vector _optimizers;

  // Tensors to be anchored other than outputs
  // NOTE(review): held by reference while the constructor takes
  // `const AnchorList &&` — confirm the referenced list outlives this object.
  const AnchorList &_anchors;

  using FunctionType = std::function &inputs,
      torch::jit::Node *)>;
  // Lowering callback for each supported node kind (looked up in lowerBody).
  std::unordered_map _functionToImplementation;

  popart_compiler::Compiler _compiler;

  CPUCallbackMap _callbacks;

  void lowerParameters();

  void lowerBody();

  void lowerReturn();

  // Comma-separated tensor names, either for `num_tensors` consecutive ids
  // starting at `first_tensor` or for an explicit list.
  std::string tensorNames(std::int64_t first_tensor, std::int64_t num_tensors);
  std::string tensorNames(const ValueMap::TensorList &tensors);

  // Comma-separated "dtype(shape)" descriptions, with the same two forms.
  std::string tensorTypesAndShapes(std::int64_t first_tensor,
                                   std::int64_t num_tensors);
  std::string tensorTypesAndShapes(const ValueMap::TensorList &tensors);

  void validateOutputShapeAndType(popart_compiler::TensorId output_tensor,
                                  torch::jit::Node *node,
                                  std::uint64_t node_output);
};

namespace {
// Remove from vec all elements vec[i] for which mask[i] is false
// The first `ignore_first` elements are always kept and are not covered by
// the mask, i.e. vec[i] is tested against mask[i - ignore_first].
// mask.at() throws if the mask is too short.
template
void maskVector(std::vector *vec, const std::vector &mask,
                size_t ignore_first = 0) {
  // The element's index is recovered from its address, so this relies on the
  // predicate being handed references to the elements inside *vec.
  const auto predicate = [&mask, &vec, ignore_first](const T &val) {
    const auto idx = static_cast(&val - &(*vec->begin()));
    if (idx < ignore_first) {
      return false;
    }
    return !mask.at(idx - ignore_first);
  };
  const auto erase_begin = std::remove_if(vec->begin(), vec->end(), predicate);
  vec->erase(erase_begin, vec->end());
}
} // namespace

/*
 * Lower to popart impl.
 */

// Compile the lowered graph and build the executable.
// The compiler, tensor hooks and in-place info are moved into the returned
// object, so this instance must not be used for lowering or compiling again.
// Raises an error if lower() has not been called.
std::shared_ptr LowerToPopartImpl::compile() {
  ERROR_ON_MSG(!_lowered, "You need to lower() the graph first");

  logging::LogContext const ctx("LowerToPopart::compile");
  // Init the session, this also involves compiling to poplar.
  _compiler.initSession(_optimizers, getModelProtoFilename().c_str());

  _compiler.compileAndPrepareDevice();

  // One PyTorch scalar type per output tensor hook, in hook order.
  std::vector data_types;
  data_types.reserve(_output_tensor_hooks.size());
  for (const auto id : _output_tensor_hooks) {
    data_types.emplace_back(fromPopartType(_compiler.getPopartType(id)));
  }

  return std::make_shared(
      std::move(_compiler), std::move(_input_tensor_hooks),
      std::move(_output_tensor_hooks), std::move(data_types), _parameter_names,
      std::move(_inplace_info));
}

// Build the executable from the file `input_filename` instead of compiling.
// Unlike compile(), this does not check that lower() has been called.
// The compiler, tensor hooks and in-place info are moved into the returned
// object.
std::shared_ptr
LowerToPopartImpl::loadExecutableFromFile(const std::string &input_filename) {
  logging::LogContext const ctx("LowerToPopart::loadExecutableFromFile");
  // Init the session, this also involves compiling to poplar.
  _compiler.initSession(_optimizers, getModelProtoFilename().c_str());
  _compiler.loadExecutableAndPrepareDevice(input_filename.c_str());

  // One PyTorch scalar type per output tensor hook, in hook order.
  std::vector data_types;
  data_types.reserve(_output_tensor_hooks.size());
  for (const auto id : _output_tensor_hooks) {
    data_types.emplace_back(fromPopartType(_compiler.getPopartType(id)));
  }

  return std::make_shared(
      std::move(_compiler), std::move(_input_tensor_hooks),
      std::move(_output_tensor_hooks), std::move(data_types), _parameter_names,
      std::move(_inplace_info));
}

// Lower the whole graph (parameters, then body, then return) and mark this
// object as lowered.
void LowerToPopartImpl::lower() {
  logging::debug("Graph lowered to PopART {");
  // Lower the tensor parameters of the _graph to OpInputs.
  lowerParameters();

  // Lower the body of the _graph.
  lowerBody();

  lowerReturn();

  logging::debug("}");
  _lowered = true;
}

// Log, at debug level, which PopART tensors `node` was lowered to. The node's
// outputs are taken to be `node->outputs().size()` consecutive ids starting
// at `first_output_tensor`.
void LowerToPopartImpl::printWasLoweredDebug(
    const torch::jit::Node *node,
    popart_compiler::TensorId first_output_tensor) {
  logging::debug(
      "{} was lowered to {} [{},{}]", nodeToString(node),
      tensorNames(first_output_tensor, node->outputs().size()),
      tensorTypesAndShapes(first_output_tensor, node->outputs().size()),
      _compiler.getExecutionInfo().get());
}

// Describe the structure of the graph outputs to the compiler and register
// every output tensor and every anchor, appending their ids to
// _output_tensor_hooks.
void LowerToPopartImpl::lowerReturn() {
  // Used to encode the number of (actual) outputs
  _compiler.addOutputType(
      {popart_compiler::OutputElemType::Tuple,
       static_cast(_inplace_info.num_normal_outputs)});

  // Recursively go through the output's type to flatten its structure and
  // add it to the compiler.
  // In this representation, (T0, T1, (T2, T3), T4) would be
  // [ Tuple3, Tensor, Tensor, Tuple2, Tensor, Tensor, Tensor]

  // Only lower the outputs not used for tensors modified inplace.
  std::function process_type;
  process_type = [this, &process_type](const c10::TypePtr &type) {
    switch (type->kind()) {
    case c10::TypeKind::TensorType: {
      _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      break;
    }
    case c10::TypeKind::TupleType: {
      const auto tuple_type = type->expect();
      _compiler.addOutputType(
          {popart_compiler::OutputElemType::Tuple,
           static_cast(tuple_type->elements().size())});
      for (const auto &elt_type : tuple_type->elements()) {
        process_type(elt_type);
      }
      break;
    }
    case c10::TypeKind::ListType: {
      // Use our custom type to find the number of tensors (lists can only be
      // tensors as enforced by torch JIT)

      // type->expect is static and always succeeds
      const auto list_type = type->cast();
      ERROR_ON(!list_type);

      _compiler.addOutputType(
          {popart_compiler::OutputElemType::List,
           static_cast(list_type->numElements())});

      for (size_t i = 0; i < list_type->numElements(); i++) {
        _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      }
      break;
    }
    default:
      ERROR("Unsupported output type '"
            << c10::typeKindToString(type->kind()));
    }
  };
  logging::debug(" return (");
  for (torch::jit::Value *value : _graph.outputs()) {
    const auto tensors = _value_map.tensors(value);
    const auto msg = fmt::format(" output: %{} : {} ->", value->debugName(),
                                 *value->type());
    logging::debug("{} {} [{}]", msg, tensorNames(tensors),
                   tensorTypesAndShapes(tensors));
    if (value->type()->kind() == c10::TypeKind::ListType) {
      // Top-level lists take their length from the lowered tensors rather
      // than from the type.
      c10::TypeKind const elt_kind =
          value->type()->expect()->getElementType()->kind();
      ERROR_ON_MSG(elt_kind != c10::TypeKind::TensorType,
                   "Unsupported list type " << c10::typeKindToString(elt_kind));
      const std::int64_t num_tensors =
          static_cast(tensors.size());
      _compiler.addOutputType(
          {popart_compiler::OutputElemType::List, num_tensors});
      logging::trace("List with num tensors: {}", num_tensors);
      for (std::int64_t i = 0; i < num_tensors; ++i) {
        _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
      }
    } else {
      process_type(value->type());
    }
    // NOTE(review): output_num is re-initialised for every graph output, so
    // the overlap symbol index restarts at 0 for each `value` — confirm this
    // is the intended indexing.
    uint64_t output_num = 0;
    for (const auto id : tensors) {
      const auto overlap_symbol = getOverlapSymbol("output", output_num);
      ERROR_ON(!_graph.return_node()->hasAttribute(overlap_symbol));
      const auto overlap_str = _graph.return_node()->s(overlap_symbol);

      _compiler.addOutputTensor(id, popart_compiler::PopartOutputMode::N, 1,
                                overlap_str.c_str());
      _output_tensor_hooks.push_back(id);
      output_num++;
    }
  }
  logging::debug(" )");

  // Anchors are registered after the regular outputs.
  // NOTE(review): output_mode and return_period are only used in the log
  // message below; addOutputTensor is called with the id alone.
  for (const auto &anchor : _anchors) {
    const char *name = anchor.name.c_str();
    popart_compiler::PopartOutputMode const output_mode =
        static_cast(anchor.mode);
    const size_t return_period = anchor.period;

    logging::debug(" anchor ( {} {}/{} )", name,
                   outputModeToString(output_mode), return_period);

    const auto id = _compiler.createTensorId(name);
    _compiler.addOutputType({popart_compiler::OutputElemType::Tensor});
    _compiler.addOutputTensor(id);
    _output_tensor_hooks.push_back(id);
  }
}

// Names of the `num_tensors` consecutive tensor ids starting at
// `first_tensor`.
std::string LowerToPopartImpl::tensorNames(std::int64_t first_tensor,
                                           std::int64_t num_tensors) {
  ValueMap::TensorList tensors;
  tensors.reserve(num_tensors);
  for (int i = 0; i < num_tensors; i++) {
    tensors.push_back(first_tensor + i);
  }
  return tensorNames(tensors);
}

// Names of the given tensors, as reported by the compiler, joined by ", ".
std::string
LowerToPopartImpl::tensorNames(const ValueMap::TensorList &tensors) {
  std::string sep{};
  std::string names;
  for (const auto tensor : tensors) {
    names += sep + _compiler.tensorName(tensor);
    sep = ", ";
  }
  return names;
}

// Types and shapes of the `num_tensors` consecutive tensor ids starting at
// `first_tensor`.
std::string
LowerToPopartImpl::tensorTypesAndShapes(std::int64_t first_tensor,
                                        std::int64_t num_tensors) {
  ValueMap::TensorList tensors;
  tensors.reserve(num_tensors);
  for (int i = 0; i < num_tensors; i++) {
    tensors.push_back(first_tensor + i);
  }
  return tensorTypesAndShapes(tensors);
}

// "dtype(d0, d1, ...)" for each tensor, joined by ", ". When the compiler
// reports an invalid size the shape is replaced by a "shape inference failed"
// marker; if querying the compiler raises logging::Error, the marker is
// appended to whatever was already written for that tensor.
std::string
LowerToPopartImpl::tensorTypesAndShapes(const ValueMap::TensorList &tensors) {
  std::string sep{};
  std::string shapes;

  const char *shape_inf_failed = "(shape inference failed)";

  for (const auto tensor : tensors) {
    std::ostringstream shape_str;

    try {
      const auto tensor_shape = _compiler.getSize(tensor);

      const auto dtype_chars = _compiler.getTensorDTypeString(tensor);
      shape_str << dtype_chars.get();

      if (tensor_shape == popart_compiler::Compiler::invalid_size) {
        shape_str << shape_inf_failed;
      } else {
        shape_str << "(";
        for (auto it = tensor_shape.cbegin(); it != tensor_shape.cend();
             it++) {
          shape_str << *it;
          if (it + 1 != tensor_shape.cend()) {
            shape_str << ", ";
          }
        }
        shape_str << ")";
      }
    } catch (const logging::Error &) {
      shape_str << shape_inf_failed;
    }

    shapes += sep + shape_str.str();
    sep = ", ";
  }
  return shapes;
}

// Compare the PopART type (and, where available, shape) of `output_tensor`
// with the JIT type information of output `node_output` of `node`.
void LowerToPopartImpl::validateOutputShapeAndType(
    popart_compiler::TensorId output_tensor, torch::jit::Node *node,
    std::uint64_t node_output) {
  torch::jit::Value *output = node->output(node_output);
  const JitTensorInfo jit_output(output);
  at::ScalarType const popart_type =
      fromPopartType(_compiler.getPopartType(output_tensor));
  const auto popart_size = _compiler.getSize(output_tensor);

  bool match = (popart_type == jit_output.scalar_type);
  // Only validate shape if PopART's shape inference worked.
if (match && popart_size != popart_compiler::Compiler::invalid_size) { match = (popart_size == jit_output.dims); } ERROR_ON_MSG(!match, "Output[" << node_output << "] mismatch: " << nodeToString(node) << " -> PopART " << tensorTypesAndShapes(output_tensor, 1)); } // Lower the main body of the _graph. void LowerToPopartImpl::lowerBody() { logging::LogContext const ctx_func("LowerToPopartImpl::lowerBody"); for (torch::jit::Node *node : _graph.nodes()) { logging::LogContext const ctx("processing " + nodeToString(node)); // Switch/lookup based on the actual int value. const c10::Symbol kind = node->kind(); // When using the dispatcher metadata should always be set. const std::string meta = node->sourceRange().source() ? node->sourceRange().source()->text_str().str() : std::string{}; ERROR_ON_MSG(meta.empty(), "Source code location missing for node " + nodeToString(node)); // Note: filename and line number might still not be available (For example // if the filter set by the user excludes the entire stack). const auto file_line_col = node->sourceRange().file_line_col(); std::uint64_t line = 0; std::uint64_t col = 0; std::string filename; if (file_line_col) { std::tie(filename, line, col) = *file_line_col; } _compiler.setCurrentPythonCodeLocation(meta.c_str(), filename.c_str(), line, col); const auto itr = _functionToImplementation.find(kind); if (itr != _functionToImplementation.cend()) { // Get the torch jit SSA for the input/output values. std::vector inputs; std::transform(node->inputs().begin(), node->inputs().end(), std::back_inserter(inputs), [&](torch::jit::Value *val) { // Tuples aren't supported here but it's ok because // we don't support any operations which actually take in // tuples. 
return _value_map.tensor(val); }); // Call the callback popart_compiler::TensorId const first_output_tensor = itr->second(inputs, node); // The callback only returns the ID of the first tensor, but we know // the generated tensors have contiguous IDs, so we can infer the other // IDs. for (std::uint64_t i = 0; i < node->outputs().size(); ++i) { torch::jit::Value *output = node->output(i); popart_compiler::TensorId const output_tensor = first_output_tensor + i; ERROR_ON_MSG(!_compiler.tensorIdIsValid(output_tensor), "Output " << i << " doesn't exist of Node " << *node); // TODO(T66614): JIT graph doesn't have any shape inference so we can't // validate the shapes. Revisit once we've migrated to MLIR. // validateOutputShapeAndType(output_tensor, node, i); _value_map.setTensor(output, output_tensor); } if (!_compiler.isHostSideConstant(first_output_tensor)) { printWasLoweredDebug(node, first_output_tensor); } } else if (kind == symbols::poptorch::end_ipu_block) { _compiler.clearActiveIpu(); } else if (kind == symbols::poptorch::start_for_loop) { _compiler.startSubgraph(); logging::debug("{} was lowered", nodeToString(node)); } else if (kind == symbols::poptorch::end_for_loop) { const std::vector inputs = _value_map.tensors(node->input(0)); // Popart needs to know the number of outputs even though it's in the // graph. const std::size_t num_outputs = node->i(c10::Symbol::attr("num_outputs")); const std::int32_t trip_count = static_cast(node->i(c10::Symbol::attr("trip_count"))); // Call the callback. This will pop the subgraphs from the stack. const popart_compiler::TensorId first_output_tensor = _compiler.endForLoop(trip_count, num_outputs, inputs); // The callback only returns the ID of the first tensor, but we know // the generated tensors have contiguous IDs, so we can infer the other // IDs. 
std::vector outs(num_outputs); for (std::uint64_t i = 0; i < num_outputs; ++i) { outs[i] = first_output_tensor + i; } _value_map.setTuple(node->output(), outs); printWasLoweredDebug(node, first_output_tensor); } else if (kind == symbols::poptorch::start_if_block) { // Starting the if block means changing the internal builder state to work // with a new subgraph. _compiler.startIfBlock(); logging::debug("{} was lowered", nodeToString(node)); } else if (kind == symbols::poptorch::start_else_block) { // Starting the else block means changing the internal builder state to // work with a new subgraph. _compiler.startElseBlock(); logging::debug("{} was lowered", nodeToString(node)); } else if (kind == symbols::poptorch::end_if_block) { // Process the if condition. const auto &inputs = _value_map.tensors(node->input(0)); const auto &condition = inputs[0]; // Popart needs to know the number of outputs even though it's in the // graph. const std::size_t num_outputs = node->i(c10::Symbol::fromQualString("attr::num_outputs")); // Call the callback. This will pop the subgraphs from the stack. const popart_compiler::TensorId first_output_tensor = _compiler.endIfBlock(condition, num_outputs); // The callback only returns the ID of the first tensor, but we know // the generated tensors have contiguous IDs, so we can infer the other // IDs. 
std::vector outs(num_outputs); std::iota(outs.begin(), outs.end(), first_output_tensor); _value_map.setTuple(node->output(), outs); printWasLoweredDebug(node, first_output_tensor); } else if (kind == symbols::poptorch::add_untyped_input_tensor) { popart_compiler::TensorId const out = _compiler.addUntypedInputTensor(); _value_map.setTensor(node->output(), out); printWasLoweredDebug(node, out); } else if (kind == symbols::poptorch::begin_ipu_block) { _compiler.setActiveIpu(node->i(c10::Symbol::attr("stage")), node->i(c10::Symbol::attr("phase")), node->i(c10::Symbol::attr("ipu"))); } else if (kind == symbols::poptorch::push_name_scope) { _compiler.pushNameScope(node->s(c10::Symbol::attr("name")).c_str()); } else if (kind == symbols::poptorch::pop_name_scope) { _compiler.popNameScope(); } else if (kind == symbols::poptorch::set_matmul_serialization) { popart_compiler::TensorId const input = _value_map.tensor(node->input()); _compiler.setMatMulSerialization( input, node->s(c10::Symbol::attr("mode")).c_str(), node->i(c10::Symbol::attr("factor")), node->i(c10::Symbol::attr("keep_precision"))); _value_map.setTensor(node->output(), input); } else if (kind == symbols::poptorch::optimizer_group) { std::vector inputs; std::transform(node->inputs().begin(), node->inputs().end(), std::back_inserter(inputs), [&](torch::jit::Value *val) { return _value_map.tensor(val); }); std::uint64_t const group = node->i(c10::Symbol::attr("group")); _compiler.optimizerGroup(inputs, group); } else if (kind == symbols::poptorch::set_available_memory) { // Get the torch jit SSA for the input/output values. 
std::vector> inputs; for (auto *input : node->inputs()) { inputs.emplace_back(); auto outputs = input->node()->outputs(); std::transform( std::begin(outputs), std::end(outputs), std::inserter(inputs.back(), std::begin(inputs.back())), [&](torch::jit::Value *val) { return _value_map.tensor(val); }); } _compiler.setAvailableMemoryProportion( inputs, node->f(c10::Symbol::attr("availableMemoryProportion"))); for (std::uint64_t i = 0; i < node->outputs().size(); ++i) { _value_map.setTensor(node->output(i), _value_map.tensor(node->input(i))); } } else if (kind == c10::prim::Constant) { ERROR_ON_MSG(node->hasAttribute(c10::attr::value), "Only None constants should be left in the graph after the " "CanonicaliseConstants pass"); _value_map.setTensor(node->output(), popart_compiler::NoneTensor); } else if (kind == c10::prim::TupleConstruct || kind == c10::prim::ListConstruct) { // Get the torch jit SSA for the input/output values. torch::jit::Value *output = node->output(); // Add the values to the value map. ValueMap::TensorList input_tensors; for (torch::jit::Value *ids : node->inputs()) { for (auto tensor : _value_map.tensors(ids)) { input_tensors.push_back(tensor); } } if (kind == c10::prim::TupleConstruct) { _value_map.setTuple(output, input_tensors); } else { _value_map.setList(output, input_tensors); } logging::debug("{} was lowered", nodeToString(node)); } else if (kind == c10::prim::TupleUnpack || kind == c10::prim::ListUnpack) { // Get the torch jit SSA for the input/output values. 
const auto &tensors(_value_map.listTuple(node->input())); auto tensor_it = tensors.begin(); // As tuples may be nested, walk recursively to flatten all tensors std::function flattened_tuple; flattened_tuple = [&](const c10::TypePtr &type, ValueMap::TensorList &tensorList) { switch (type->kind()) { case c10::TypeKind::TensorType: { ERROR_ON_MSG(tensor_it == tensors.end(), "Not enough tensors to unpack"); tensorList.push_back(*tensor_it); tensor_it++; break; } case c10::TypeKind::TupleType: { auto tuple = type->expect(); for (const auto &elt_type : tuple->elements()) { flattened_tuple(elt_type, tensorList); } break; } default: ERROR("Unsupported type '" << c10::typeKindToString(type->kind())); } }; for (auto *output : node->outputs()) { switch (output->type()->kind()) { case c10::TypeKind::TensorType: { ERROR_ON(tensor_it == tensors.end()); _value_map.setTensor(output, *tensor_it); tensor_it++; break; } case c10::TypeKind::ListType: // (should only have TensorType) case c10::TypeKind::TupleType: { ValueMap::TensorList tensor_list; flattened_tuple(output->type(), tensor_list); _value_map.setTuple(output, tensor_list); break; } default: ERROR("Unsupported parameter type '" << c10::typeKindToString(output->type()->kind())); } } ERROR_ON_MSG(tensor_it != tensors.end(), "Didn't unpack all the tensors"); logging::debug("{} was lowered", nodeToString(node)); } else if (kind == symbols::poptorch::host_side_cast) { // Map to the input value since the type will be cast host side ERROR_ON_MSG(!_value_map.hasTensor(node->input()), "Input to host side cast has not been registered"); ERROR_ON_MSG(node->inputs().size() != 1, "Host side cast should only have one input."); _value_map.setTensor(node->output(), _value_map.tensor(node->input())); } else if (kind == symbols::poptorch::multi_conv_part) { std::vector inputs; std::transform(node->inputs().begin(), node->inputs().end(), std::back_inserter(inputs), [&](torch::jit::Value *val) { return _value_map.tensor(val); }); 
_compiler.addMultiConvPart(inputs, node->is(c10::Symbol::attr("dilations")), node->is(c10::Symbol::attr("kernel_shape")), node->is(c10::Symbol::attr("pads")), node->is(c10::Symbol::attr("strides"))); logging::debug("{} was lowered as component of MultiConv", nodeToString(node)); } else if (kind == symbols::poptorch::end_multi_conv) { // Extract multiconv options that are set as attributes on the // end_multi_conv instruction const auto amp = c10::Symbol::attr("available_memory_proportions"); if (node->hasAttribute(amp)) { _compiler.setMultiConvAvailableMemoryProportions(node->fs(amp)); } const auto partials_types = c10::Symbol::attr("partials_types"); if (node->hasAttribute(partials_types)) { _compiler.setMultiConvPartialsTypes(node->is(partials_types)); } const auto conv_ditherings = c10::Symbol::attr("enable_conv_dithering"); if (node->hasAttribute(conv_ditherings)) { _compiler.setMultiConvEnableConvDithering(node->is(conv_ditherings)); } const auto plan_type = c10::Symbol::attr("plan_type"); if (node->hasAttribute(plan_type)) { _compiler.setMultiConvPlanType(node->i(plan_type)); } const auto per_conv_reserved_tiles = c10::Symbol::attr("per_conv_reserved_tiles"); if (node->hasAttribute(per_conv_reserved_tiles)) { _compiler.setMultiConvPerConvReservedTiles( node->i(per_conv_reserved_tiles)); } const auto cycle_back_off = c10::Symbol::attr("cycle_back_off"); if (node->hasAttribute(cycle_back_off)) { _compiler.setMultiConvCycleBackOff(node->f(cycle_back_off)); } const torch::jit::ArrayRef node_outputs = node->outputs(); std::vector outputs = _compiler.endMultiConv(); ERROR_ON_MSG(outputs.size() != node_outputs.size(), "Wrong number of outputs for MultiConv. 
Expected " << node_outputs.size() << " outputs but only received " << outputs.size() << " outputs."); for (size_t i = 0; i < outputs.size(); i++) { _value_map.setTensor(node_outputs[i], outputs[i]); } printWasLoweredDebug(node, outputs.front()); } else if (kind == symbols::poptorch::canonicalised_cpu_call) { // CPU callbacks are referenced by an string identifier. std::string const id = node->s(c10::Symbol::attr("ID")); std::vector input_types; std::vector> input_shapes; // Get the torch jit SSA for the input/output values. std::vector inputs; std::transform(node->inputs().begin(), node->inputs().end(), std::back_inserter(inputs), [&](torch::jit::Value *val) { // Append type info from the inputs. platformAgnosticTypeInfoFromIRType(val, &input_types, &input_shapes); return _value_map.tensor(val); }); std::vector output_types; std::vector> output_shapes; for (torch::jit::Value *value : node->outputs()) { platformAgnosticTypeInfoFromIRType(value, &output_types, &output_shapes); } popart_compiler::TensorId const first_output_tensor = _compiler.addCPUCallback(inputs, _callbacks[id], input_types, input_shapes, output_types, output_shapes); for (std::uint64_t i = 0; i < node->outputs().size(); ++i) { torch::jit::Value *output = node->output(i); popart_compiler::TensorId const output_tensor = first_output_tensor + i; ERROR_ON_MSG(!_compiler.tensorIdIsValid(output_tensor), "Output " << i << " doesn't exist of Node " << *node); _value_map.setTensor(output, output_tensor); } printWasLoweredDebug(node, first_output_tensor); } else if (kind == symbols::poptorch::set_attribute) { const std::string &attribute = node->s(c10::Symbol::attr("attribute")); const std::string &key = node->s(c10::Symbol::attr("key")); const std::string &value = node->s(c10::Symbol::attr("value")); _compiler.setAttribute(attribute.c_str(), key.c_str(), value.c_str()); } else if (kind == symbols::poptorch::clear_attribute) { const std::string &attribute = node->s(c10::Symbol::attr("attribute")); const 
std::string &key = node->s(c10::Symbol::attr("key")); _compiler.clearAttribute(attribute.c_str(), key.c_str()); } else { ERROR("Couldn't find a registered operation for node " << *node); } } } void LowerToPopartImpl::lowerParameters() { // The "true" inputs are a mixture of tuples (which may be nested) and tensors // The parameters are all tensors. "_graph.inputs()." contains the inputs // first followed by the parameters at the end. // This will provide a view of all the tensors in _graph.inputs(), i.e. // by collapsing tuples. auto graph_t_inputs = collapsedGraphInputHierachy(&_graph); // Step 0, remove unused parameters // graph_t_inputs is updated but _graph.inputs() will retain unused // parameters std::vector parameter_used(graph_t_inputs.size(), true); for (size_t i = 0; i < graph_t_inputs.size(); ++i) { auto *value = graph_t_inputs[i]; if (value->uses().empty() && isParameter(value)) { parameter_used.at(i) = false; logging::trace("Skipping unused parameter: %{}", value->debugName()); } } maskVector(&graph_t_inputs, parameter_used); // Step 1, add tensor inputs for all tensors in the hierarchy and obtain // the resulting popart IDs. This can be done with collapsed hierarchy. 
ValueMap::TensorList parameter_popart_ids; std::vector parameter_values; size_t input_index = 0; size_t param_index = 0; for (auto *value : graph_t_inputs) { JitTensorInfo info = JitTensorInfo(value); const char *popart_type = typeToPopartCStr(info.scalar_type); if (isParameter(value)) { void *data_ptr = getDataSourceForValue(value); ERROR_ON_MSG(value->uses().empty(), "Parameter %" << value->debugName() << " isn't used and therefore should have been removed"); ERROR_ON(param_index > _parameter_names.size()); const std::string name = getParameterName(value); ERROR_ON_MSG(name.empty(), "No parameter name available for value %" << value->debugName()); _parameter_names.push_back(name); popart_compiler::TensorId id; PerReplicaSettings pr_settings; if (getParameterPerReplica(value, pr_settings)) { std::vector dims(info.dims.size() + 1); dims[0] = pr_settings.size0; memcpy(&dims[1], info.dims.data(), info.dims.size() * sizeof(std::int64_t)); id = _compiler.addInitializedInputTensor( name.c_str(), popart_type, dims, pr_settings.host_buffer->data(), pr_settings.comm_group_type, pr_settings.shards, pr_settings.variable_retrieval_mode); } else { id = _compiler.addInitializedInputTensor(name.c_str(), popart_type, info.dims, data_ptr); } // Compiler knows which buffers are updatable _compiler.registerUpdatableNamedBuffer(id); parameter_values.push_back(value); parameter_popart_ids.push_back(id); param_index++; } else { auto overlap_symbol = getOverlapSymbol("input", input_index); std::string overlap_str("no_overlap"); if (_graph.param_node()->hasAttribute(overlap_symbol)) { overlap_str = _graph.param_node()->s(overlap_symbol); } const auto id = _compiler.addInputTensor(popart_type, info.dims, overlap_str.c_str()); _input_tensor_hooks.push_back(id); input_index++; } } // Step 2, map the PopART tensor IDs to the JIT Value of the (not collapsed) // graph inputs logging::debug("graph("); auto input_tensor_it = _input_tensor_hooks.begin(); size_t index = 0; for (torch::jit::Value 
*value : _graph.inputs()) { if (isParameter(value)) { // Only process inputs continue; } ERROR_ON(value->node()->kind() != c10::prim::Param); const size_t num_tensors = numTensorsForType(value->type()); ValueMap::TensorList tensors; tensors.reserve(num_tensors); for (size_t i = 0; i < num_tensors; i++) { ERROR_ON(input_tensor_it == _input_tensor_hooks.end()); tensors.push_back(*input_tensor_it); input_tensor_it++; } if (value->type()->kind() == c10::TypeKind::TensorType) { ERROR_ON(tensors.size() != 1); _value_map.setTensor(value, tensors.front()); } else { ERROR_ON(value->type()->kind() != c10::TypeKind::TupleType); _value_map.setTuple(value, tensors); } const auto msg = fmt::format(" input: %{} : {} ->", value->debugName(), *value->type()); logging::debug("{} {} [{}]", msg, tensorNames(tensors), tensorTypesAndShapes(tensors)); index++; } // Step 3, map the PopART tensor IDs to the JIT Value of the parameters for (index = 0; index < parameter_popart_ids.size(); index++) { auto *value = parameter_values.at(index); auto &tensor(parameter_popart_ids.at(index)); const auto msg = fmt::format(" param: %{} : {} ->", value->debugName(), *value->type()); logging::debug("{} {} [{}]", msg, tensorNames(tensor, 1), tensorTypesAndShapes(tensor, 1)); _value_map.setTensor(value, tensor); } logging::debug(" ):"); } namespace { // Helper to let us filter string arguments into const char*s. This is to catch // the std::string produced by some attributes before they cross the ABI // boundary. template T convertType(T &&t) { return t; } // String, return const char*. 
const char *convertType(const std::string &s) { return s.c_str(); // NOLINT } // vector, return vector std::vector convertType(const std::vector &s) { std::vector result; std::transform(s.begin(), s.end(), std::back_inserter(result), [](const std::string &str) { return str.c_str(); // NOLINT }); return result; } // vector std::vector convertType(const std::vector &v) { std::vector result; std::transform(v.begin(), v.end(), std::back_inserter(result), [](double d) { return static_cast(d); }); return result; } popart_compiler::PopartConstant convertTensorConstantNode(const torch::jit::Node *node) { logging::LogContext const ctx("convertTensorConstantNode: processing " + nodeToString(node)); ERROR_ON_MSG( node->kind() != symbols::poptorch::tensor_constant, "Only a popart_compiler::tensor_constant can be converted into a popart " "constant"); auto output_type = *node->output()->type()->expect()->scalarType(); auto tensor_type = getNodeTensorAttrValue(node).scalar_type(); ERROR_ON_MSG(output_type != tensor_type, "Output type is " << c10::toString(output_type) << " but tensor type is " << c10::toString(tensor_type)); auto tensor = getNodeTensorAttrValue(node); ERROR_ON(!tensor.is_contiguous()); return {toPopartType(tensor.scalar_type()), tensor.data_ptr(), getTensorDimensions(tensor)}; } popart_compiler::HostSideConstant convertHostSideTensorConstantNode(const torch::jit::Node *node) { logging::LogContext const ctx( "convertHostSideTensorConstantNode: processing " + nodeToString(node)); ERROR_ON_MSG(node->kind() != symbols::poptorch::host_side_tensor_constant, "Only a poptorch::host_side_tensor_constant can be converted " "into a host side constant constant"); auto tensor = getNodeTensorAttrValue(node); ERROR_ON(!tensor.is_contiguous()); return {toPopartType(tensor.scalar_type()), tensor.data_ptr(), tensor.nbytes(), getTensorDimensions(tensor)}; } void processListAttribute( const char *name, const std::shared_ptr> &attributes, const IPyValue &elements) { const auto 
first_element = elements.getFromList(0); if (first_element->isInt()) { std::vector ints; ints.reserve(elements.getListSize()); elements.forEachInList([&ints](const IPyValue &int_obj) { ints.push_back(int_obj.toInt64()); }); attributes->emplace_back(name, ints); return; } if (first_element->isDouble()) { std::vector floats; floats.reserve(elements.getListSize()); elements.forEachInList([&floats](const IPyValue &float_obj) { floats.push_back(float_obj.toFloatWithRangeCheck()); }); attributes->emplace_back(name, floats); return; } if (first_element->isString()) { std::vector> strs; strs.reserve(elements.getListSize()); elements.forEachInList([&strs](const IPyValue &str) { strs.emplace_back(stringToUniquePtr(str.toString())); }); attributes->emplace_back(name, strs); return; } ERROR("Invalid type for Popart attribute."); } std::shared_ptr> convertCustomOpAttributes(const torch::jit::Node *node, const AttributeAccessor &attribute_accessor) { logging::LogContext const ctx("convertCustomOpAttributes: processing " + nodeToString(node)); std::string const attributes_id_str( node->s(c10::Symbol::attr("attributes_id"))); auto dict_obj = attribute_accessor(attributes_id_str); auto attributes = std::make_shared>(); dict_obj->forEachInDict([&attributes](const IPyValue &key, const IPyValue &attribute) { auto name = key.toString(); if (attribute.isInt()) { attributes->emplace_back(name.c_str(), attribute.toInt64()); } else if (attribute.isDouble()) { attributes->emplace_back(name.c_str(), attribute.toFloatWithRangeCheck()); } else if (attribute.isString()) { attributes->emplace_back(name.c_str(), stringToUniquePtr(attribute.toString())); } else if (attribute.isSetListOrTuple()) { processListAttribute(name.c_str(), attributes, attribute); } else { ERROR("Invalid attribute type"); } }); return attributes; } } // namespace LowerToPopartImpl::LowerToPopartImpl( torch::jit::Graph *g, InplaceGraphInfo &&inplace_info, bool training, std::vector &&opt, const 
popart_compiler::SessionOptions &options, const AttributeAccessor &attribute_accessor, CPUCallbackMap &&callback, const AnchorList &&anchors) : _graph(*g), _lowered(false), _inplace_info(std::move(inplace_info)), _optimizers(opt), _anchors(anchors), _compiler(training, options), _callbacks(callback) { // Init the function implementation map. This map will be populated by // elements which look something like: /* {"popart::Foo", [&](const std::vector &inputs, torch::jit::Node *node) { return _compiler.foo(inputs, node->i("attr::SomeIntegerAttr"), node->i("attr::SomeOtherIntegerAttr"), node->is("attr::AnIntArrayAttr"), node->f("attr::AFloatAttr")); } }, */ // Essentially this is just a map from the string IR symbol to a function to // be called that implements it. Those functions are also autogenerated by the // same macros in _compiler.hpp and _compiler.cpp. _functionToImplementation = { // Torch JIT api defines the attribute accessor as the following function names. #define INT_VEC is #define FLOAT_VEC fs #define FLOAT f #define INT i #define CHAR i #define BOOL i #define STRING s #define STRING_VEC ss // Useful NOP macro #define NONE // The arguments are processed by extracting the given type using the above // accessors, the name is converted into "attr::NAME" which is what pytorch JIT // expects for attribute accessing. #define ARG(Type, Name) , convertType(node->Type(c10::Symbol::attr(#Name))) #define POPART_CONST_ARG(unused) , convertTensorConstantNode(node) #define HOST_SIDE_CONST_ARG(unused) \ , std::move(convertHostSideTensorConstantNode(node)) #define POPART_ATTRIB_VEC_ARG(unused) \ , convertCustomOpAttributes(node, attribute_accessor) #define BODY_ARG(Name) NONE // Create a function decl with the given call and arguments. 
#define OP_DECL(ns, symbolName, function, unused, Args, unused2) \ {symbols::ns::symbolName, \ [&](const std::vector &inputs, \ torch::jit::Node *node) { \ (void)(node); \ return _compiler.function(inputs Args); \ }}, #define OP_DECL_NO_RETURN(ns, symbolName, function, unused, Args, unused2) \ {symbols::ns::symbolName, \ [&](const std::vector &inputs, \ torch::jit::Node *node) { \ _compiler.function(inputs Args); \ ERROR_ON_MSG(node->outputs().size() != 0, \ "Void return function called on torch::jit::Node which has " \ "outputs"); \ return popart_compiler::TensorId{}; \ }}, #include "popart_compiler/SupportedOperations.inc.hpp" #undef BODY_STR_ARG #undef STR_ARG #undef BODY_ARG #undef POPART_ATTRIB_VEC_ARG #undef HOST_SIDE_CONST_ARG #undef POPART_CONST_ARG #undef OP_DECL #undef OP_DECL_NO_RETURN #undef ARG #undef NONE #undef BOOL #undef CHAR #undef STRING #undef STRING_VEC #undef INT #undef FLOAT #undef FLOAT_VEC #undef INT_VEC }; // End map initalizer. } } // namespace detail LowerToPopart::LowerToPopart(torch::jit::Graph *graph, InplaceGraphInfo &&inplace_info, bool training, std::vector &&opt, const popart_compiler::SessionOptions &options, const AttributeAccessor &attribute_accessor, CPUCallbackMap callbacks, AnchorList &&anchors) { _impl = std::make_unique( graph, std::move(inplace_info), training, std::move(opt), std::move(options), attribute_accessor, std::move(callbacks), std::move(anchors)); } void LowerToPopart::lower() { _impl->lower(); } std::shared_ptr LowerToPopart::compile() { auto executable = _impl->compile(); if (logging::outputPopartIR()) { logging::debug("Popart IR: {}", executable->getPopartIR()); } return executable; } std::shared_ptr LowerToPopart::loadExecutableFromFile(const std::string &input_filename) { return _impl->loadExecutableFromFile(input_filename); } LowerToPopart::~LowerToPopart() = default; LowerToPopart::LowerToPopart(LowerToPopart &&lower) { _impl = std::move(lower._impl); } } // namespace poptorch 
================================================ FILE: poptorch/source/LowerToPopartFactories.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include "poptorch/LowerToPopartFactories.hpp" #include #include #include #include #include #include #include "popart_compiler/Compiler.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch_logging/Tracepoint.hpp" #include "poptorch/AliasProcessing.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/ImplicitCasting.hpp" #include "poptorch/InplaceOps.hpp" #include "poptorch/OverlappedIO.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/RequiresGrad.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" namespace poptorch { poptorch::LowerToPopart lowerToPopartFromDispatch( SessionOptionsParser &parser, bool training, AnchorList &&anchors_list, const std::function &initCallbackBuffers, std::vector &&optimizers, const AttributeAccessor &attribute_accessor, CPUCallbackMap &callbacks) { auto &parsed_options = parser.options(); const std::shared_ptr graph = getTracedGraph(); logging::trace("Initial dispatched graph:\n{}", *graph); fixRequiresGradFromDispatch(graph.get()); torch::jit::EliminateDeadCode(graph); torch::jit::PeepholeOptimize(graph); logging::trace("Optimised graph:\n{}", *graph); InplaceGraphInfo inplace_info = getInplaceGraphInfo( anchors_list.size(), parsed_options.hasInputReplication() && parsed_options.broadcastBuffers()); logging::trace("Graph after handling inplace ops:\n{}", *graph); poptorch::attributiseOverlappedIO(graph.get()); logging::trace("Graph after attributising IO overlap specifiers:\n{}", *graph); fixForLoopInputs(*graph); verifyIfElseBlocksOrder(*graph); poptorch::type_and_constant_canonicalization::evaluateConstexprs(graph.get()); logging::trace("Graph after evaluating constant expressions:\n{}", *graph); 
poptorch::type_and_constant_canonicalization::canonicaliseConstants( graph.get()); logging::trace("Graph after constant canonicalisation:\n{}", *graph); poptorch::removeScatterAddIndexExpansion(graph.get()); poptorch::simplifyGatherWithExpandedIndices(graph.get()); logging::trace("Graph before PopART canonicalisation:\n{}", *graph); poptorch::canonicalize(graph.get()); logging::trace("Graph before PopART grouping gathers and scatters:\n{}", *graph); poptorch::groupScatterReduceAndGatherNodes(graph.get()); poptorch::annotateSubgraphs(graph.get(), graph->nodes().front()); // Collapse any `begin_cpu ... end_cpu` sequences into a single node, with the // correct inputs & outputs. poptorch::cpuOffloadingCleanup(graph.get()); if (graph->outputs().empty()) { logging::trace("No outputs, so all nodes cleared"); for (auto it = graph->nodes().rbegin(); it != graph->nodes().rend(); it++) { it.destroyCurrent(); } } // TODO(T67295): remove after we use our own dispatch key. removeDeadImplicitCasts(graph.get()); canonicalizeLate(graph.get()); logging::trace("Graph after PopART canonicalisation:\n{}", *graph); if (training) { poptorch::addDetachOperations(graph.get()); poptorch::removeSurplusIdentityLosses(graph.get()); logging::trace("Graph after adding detach operations:\n{}", *graph); } // Error the user if any operations couldn't be canonicalised. poptorch::errorOnUnsupportedAten(graph.get()); // Prepare CPU op callbacks, by allocating the CPU tensors where the // inputs/outputs will be stored. We have to do this at the last possible // moment due to tracing. 
initCallbackBuffers(); logging::trace("Graph before lowering to PopART:\n{}", *graph); poptorch::LowerToPopart lower( graph.get(), std::move(inplace_info), training, std::move(optimizers), parsed_options, attribute_accessor, callbacks, std::move(anchors_list)); lower.lower(); return lower; } } // namespace poptorch ================================================ FILE: poptorch/source/OpBuilder.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "popart_compiler/PopartEnums.hpp" #include "popart_compiler/Utils.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "PoptorchSymbols.hpp" #include namespace poptorch { namespace { at::ScalarType scalarTypeFromInput(const torch::jit::Node *node, size_t num) { ERROR_ON_MSG(node->inputs().size() <= num, "Cannot get scalar type from input " << num << " as it does not exist"); return *node->input(num)->type()->expect()->scalarType(); } class SourceLocation { public: // SourceLocation is considered enabled if a location or metadata // has been explicitly set. 
bool isEnabled() const { return _enabled; } void setLocation(const std::string &filename, std::uint64_t line) { _enabled = true; _dirty = true; _filename = filename; _line = line; } void setMetadata(const std::string &metadata) { _enabled = true; _dirty = true; _metadata = metadata; } const torch::jit::SourceRange &sourceRange() { if (_dirty) { _dirty = false; c10::optional filename; if (!_filename.empty()) { filename = _filename; } ERROR_ON_MSG( _metadata.empty(), "[Internal] Metadata missing (setCurrentMetadata() missing)"); auto source = std::make_shared(_metadata, filename, _line); _source_range = torch::jit::SourceRange(source, 0, 1); } return _source_range; } private: bool _enabled{false}; bool _dirty{false}; torch::jit::SourceRange _source_range; std::string _metadata; std::string _filename; std::uint64_t _line; } current_source_location = {}; } // namespace void resetCurrentSourceLocation() { current_source_location = SourceLocation(); } void setCurrentPythonCodeLocation( const torch::jit::SourceRange &source_location) { auto file_line_col = source_location.file_line_col(); std::uint64_t line = 0; std::uint64_t col = 0; std::string filename; if (file_line_col) { std::tie(filename, line, col) = *file_line_col; } current_source_location.setLocation(filename, line); } void setCurrentMetadata(const std::string &metadata) { current_source_location.setMetadata(metadata); } WithNodeMetadata::WithNodeMetadata(torch::jit::Node *node) { // If no source location has been set yet // then the node won't contain any location information. 
if (current_source_location.isEnabled()) { std::string meta; auto sr = node->sourceRange(); if (sr.source()) { meta = sr.source()->text_str().str(); } setCurrentPythonCodeLocation(sr); setCurrentMetadata(meta); } } WithNodeMetadata::~WithNodeMetadata() { if (current_source_location.isEnabled()) { setCurrentPythonCodeLocation({}); setCurrentMetadata(""); } } torch::jit::Node *createNode(torch::jit::Graph *graph, torch::jit::NodeKind kind, torch::jit::ArrayRef inputs, const ImplicitCast implicit_cast, OutputType output_type, size_t num_outputs, c10::optional dtype) { torch::jit::Node *new_node; if (implicit_cast != ImplicitCast::None && !inputs.empty()) { logging::LogContext ctx(std::string("implicitly casting inputs of ") + kind.toQualString()); auto possibly_cast_inputs = implicitCastInputs(&inputs, implicit_cast); ctx.clear(); new_node = graph->create(kind, num_outputs); for (auto *input : possibly_cast_inputs) { new_node->addInput(input); } } else { new_node = graph->create(kind, inputs, num_outputs); } if (dtype) { if (*dtype != at::ScalarType::Undefined) { new_node->s_(c10::attr::dtype, scalarTypeToOnnxString(*dtype)); } } setNodeOutputsTypes(new_node, implicit_cast, output_type); return new_node; } torch::jit::Node * createAndInsertNode(torch::jit::Graph *graph, torch::jit::NodeKind kind, torch::jit::ArrayRef inputs, const ImplicitCast implicit_cast, OutputType output_type, size_t num_outputs, c10::optional dtype) { torch::jit::Node *new_node = createNode(graph, kind, inputs, implicit_cast, output_type, num_outputs, dtype); insertNodeInGraph(graph, new_node); return new_node; } torch::jit::Value *insertConstant(torch::jit::Graph *graph, const torch::jit::IValue &val) { return graph->insertConstant(val, current_source_location.sourceRange()); } void setSourceRangeToCurrentLocation(torch::jit::Node *node) { node->setSourceRange(current_source_location.sourceRange()); } void insertNodeInGraph(torch::jit::Graph *graph, torch::jit::Node *new_node) { 
setSourceRangeToCurrentLocation(new_node); graph->insertNode(new_node); setAvailableMemoryAddPossibleInputOp(new_node); } void insertNodeBeforeNode(torch::jit::Node *new_node, torch::jit::Node *insert_point) { setSourceRangeToCurrentLocation(new_node); new_node->insertBefore(insert_point); setAvailableMemoryAddPossibleInputOp(new_node); } void insertNodeAfterNode(torch::jit::Node *new_node, torch::jit::Node *insert_point) { setSourceRangeToCurrentLocation(new_node); new_node->insertAfter(insert_point); setAvailableMemoryAddPossibleInputOp(new_node); } // Sets the scalar types of every output of a node void setNodeOutputsTypes(torch::jit::Node *node, const ImplicitCast implicit_cast, const OutputType output_type) { at::ScalarType resolved_output_type; switch (output_type) { case OutputType::Unknown: { return; } case OutputType::AsFirstInput: { resolved_output_type = scalarTypeFromInput(node, 0); break; } case OutputType::FirstAsFirstInputSecondAlwaysInt: { node->output(0)->setType( c10::TensorType::create(scalarTypeFromInput(node, 0), c10::nullopt, c10::nullopt, c10::nullopt)); node->output(1)->setType(c10::TensorType::create( at::ScalarType::Int, c10::nullopt, c10::nullopt, c10::nullopt)); return; } case OutputType::AsThirdInput: { resolved_output_type = scalarTypeFromInput(node, 2); break; } case OutputType::AsImplicitCastPromoted: { const size_t input_idx = (implicit_cast == ImplicitCast::ExceptFirst) ? 
1 : 0; resolved_output_type = scalarTypeFromInput(node, input_idx); break; } case OutputType::AsDtype: [[fallthrough]]; case OutputType::AsDtypeOrAsPromoted: { // Cast uses "to" not "dtype" and a string if (node->kind() == symbols::popart::cast) { // Type is handled in OpBuilder.cpp return; } if (node->hasAttribute(c10::attr::dtype)) { if (node->kindOf(c10::attr::dtype) == torch::jit::AttributeKind::i) { const auto onnx_dtype = node->i(c10::attr::dtype); resolved_output_type = onnxStrToScalarType( popart_compiler::onnxStrFromDtypeInt(onnx_dtype)); } else { const auto &onnx_dtype = node->s(c10::attr::dtype); resolved_output_type = onnxStrToScalarType(onnx_dtype.c_str()); } } else { // Without dtype, the input will be the correct type resolved_output_type = scalarTypeFromInput(node, 0); // This may be needed in the lower to popart stage node->s_(c10::attr::dtype, scalarTypeToOnnxString(resolved_output_type)); } break; } case OutputType::AlwaysBool: { resolved_output_type = at::ScalarType::Bool; break; } case OutputType::AlwaysFloat: { resolved_output_type = at::ScalarType::Float; break; } case OutputType::AlwaysInt: { resolved_output_type = at::ScalarType::Int; break; } case OutputType::AlwaysUint8: { resolved_output_type = at::ScalarType::Byte; break; } default: { ERROR("Unsupported output_type in setNodeOutputsTypes"); } } for (auto *output : node->outputs()) { output->setType(c10::TensorType::create(resolved_output_type, c10::nullopt, c10::nullopt, c10::nullopt)); } } torch::jit::Node *tensorToConstant(torch::jit::Graph *graph, const at::Tensor &t, UseOfNode constant_use) { c10::Symbol symbol; switch (constant_use) { case UseOfNode::HostSideOnly: symbol = symbols::poptorch::host_side_tensor_constant; break; case UseOfNode::PopARTOnly: symbol = symbols::poptorch::tensor_constant; break; case UseOfNode::HostSideAndPopART: symbol = symbols::poptorch::host_and_ipu_side_tensor_constant; break; } torch::jit::Node *new_node = createAndInsertNode(graph, symbol); 
new_node->output()->inferTypeFrom(t); setNodeTensorAttrValue(new_node, t); return new_node; } /* * Manually added operation. */ torch::jit::Node *createReshape(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &new_shape) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::popart::reshape_static_shape, {A}); new_node->is_(c10::attr::shape, new_shape); new_node->output()->setType( A->type()->expect()->withSizes(new_shape)); return new_node; } template torch::jit::Node *createConstant(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape, at::ScalarType scalar_type) { const auto total_size = static_cast(std::accumulate( new_shape.cbegin(), new_shape.cend(), 1, std::multiplies())); size_t stride = 0; if (data.size() != 1) { ERROR_ON(total_size != data.size()); stride = 1; } auto t = at::empty( {new_shape}, at::dtype(scalar_type).memory_format(c10::MemoryFormat::Contiguous)); auto *t_data = t.data_ptr(); for (size_t i = 0; i < total_size; i++) { t_data[i] = static_cast(data[i * stride]); // NOLINT } return tensorToConstant(graph, t); } torch::jit::Node * createConstantInt(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape) { return createConstant(graph, data, new_shape, at::ScalarType::Int); } torch::jit::Node * createConstantLong(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape) { return createConstant(graph, data, new_shape, at::ScalarType::Long); } torch::jit::Node * createConstantFloat32(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape) { return createConstant(graph, data, new_shape, at::ScalarType::Float); } torch::jit::Node * createConstantFloatLike(torch::jit::Graph *graph, torch::jit::Value *t, const std::vector &data, const std::vector &new_shape) { at::ScalarType const scalar_type = *t->type()->expect()->scalarType(); torch::jit::Node *new_node = createConstantFloat32(graph, data, new_shape); if (scalar_type == 
at::ScalarType::Half) { auto new_tensor = getNodeTensorAttrValue(new_node).to(scalar_type); setNodeTensorAttrValue(new_node, new_tensor); new_node->output()->inferTypeFrom(new_tensor); } return new_node; } torch::jit::Node *createInternalCast(torch::jit::Graph *graph, torch::jit::Value *A, const std::string &type) { // Convert from onnx string to a torch jit scalar object. c10::ScalarType const as_type = onnxStrToScalarType(type.c_str()); // Create the actual cast. return createCast(graph, A, as_type); } torch::jit::Node *createCast(torch::jit::Graph *graph, torch::jit::Value *A, c10::ScalarType scalar_type) { std::string const new_type = scalarTypeToOnnxString(scalar_type); auto *node = createCast(graph, {A}, new_type); const auto tensor_type = A->type()->expect(); node->output()->setType(tensor_type->withScalarType(scalar_type)); return node; } static std::vector convertPytorchPads(const std::vector &tensor_shape, std::vector pad_shape) { // PopART requires padding for each dimension to be specified, so pad the // padding vector with zeros twice for each unspecified dim (one for // padding_before, one for padding_after) pad_shape.resize(tensor_shape.size() * 2, 0); // Converting from PyTorch to PopART requires two steps: // 1. Reverse the order // (beginN, endN, ..., begin1, end1) -> // (end1, begin1, ..., endN, beginN) std::reverse(pad_shape.begin(), pad_shape.end()); // 2. Order padding dims by begin/end // (end1, begin1, ..., endN, beginN) -> // (begin1, ..., beginN, end1, ..., endN) // // This can be done with a single partition because begin and end // dims are at odd and even indices respectively. 
A stable partition // guarantees that the relative ordering of begin or end dims is unchanged std::stable_partition(pad_shape.begin(), pad_shape.end(), [&](const int64_t &dim) { auto index = &dim - std::addressof(pad_shape[0]); return index % 2 == 1; }); return pad_shape; } torch::jit::Node *createConstantPad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape, float constant, bool direct_pad_shape_input) { const auto converted_pad_shape = direct_pad_shape_input ? pad_shape : convertPytorchPads(shapeFromTensor(A), pad_shape); auto *pads_tensor = createConstantLong(graph, converted_pad_shape, {static_cast(converted_pad_shape.size())}) ->output(); auto *constant_value_tensor = createConstantFloat32(graph, {constant}, {1})->output(); return createAndInsertNode(graph, symbols::poptorch::constant_pad, {A, pads_tensor, constant_value_tensor}, ImplicitCast::None, OutputType::AsFirstInput); } torch::jit::Value *wrapInConstantVec(torch::jit::Graph *graph, const std::vector &data) { return createConstantInt(graph, data, {static_cast(data.size())}) ->output(); } torch::jit::Node *createEdgePad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape) { const auto converted_pad_shape = convertPytorchPads(shapeFromTensor(A), pad_shape); auto *pads_tensor = createConstantLong(graph, converted_pad_shape, {static_cast(converted_pad_shape.size())}) ->output(); return createAndInsertNode(graph, symbols::poptorch::edge_pad, {A, pads_tensor}, ImplicitCast::None, OutputType::AsFirstInput); ; } torch::jit::Node *createReflectionPad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape) { const auto converted_pad_shape = convertPytorchPads(shapeFromTensor(A), pad_shape); auto *pads_tensor = createConstantLong(graph, converted_pad_shape, {static_cast(converted_pad_shape.size())}) ->output(); return createAndInsertNode(graph, symbols::poptorch::reflection_pad, {A, pads_tensor}, ImplicitCast::None, 
OutputType::AsFirstInput); } torch::jit::Node *createAddNotInPlace(torch::jit::Graph *graph, torch::jit::Value *A, torch::jit::Value *B) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::add_not_in_place, {A, B}, ImplicitCast::All, OutputType::AsImplicitCastPromoted); return new_node; } torch::jit::Node * createCustomOperation(torch::jit::Graph *graph, const std::vector &inputs, const std::string &name, const std::string &domain, std::int64_t domainVersion, std::int64_t numOutputs, const std::string &attributes_id_str) { const OutputType type = (numOutputs > 1) ? OutputType::Unknown : OutputType::AsFirstInput; torch::jit::Node *new_node = createAndInsertNode(graph, symbols::poptorch::custom_operation, inputs, ImplicitCast::None, type, numOutputs); new_node->s_(c10::Symbol::attr("name"), name); new_node->s_(c10::Symbol::attr("domain"), domain); new_node->i_(c10::Symbol::attr("version"), domainVersion); new_node->i_(c10::Symbol::attr("num_outputs"), numOutputs); new_node->s_(c10::Symbol::attr("attributes_id"), attributes_id_str); return new_node; } torch::jit::Node *createAddUntypedInputTensor(torch::jit::Graph *graph, torch::jit::Value *input) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::add_untyped_input_tensor, {input}); return new_node; } torch::jit::Node *createAddOutputTensor(torch::jit::Graph *graph, torch::jit::Value *output) { // We explicitly don't want to add this one as we want to add it based on the // position of the other node. 
torch::jit::Node *new_node = graph->create(symbols::poptorch::addOutputTensor, {output}, 0); return new_node; } torch::jit::Node *createStartForLoop(torch::jit::Graph *graph, torch::jit::Value *inputs) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::poptorch::start_for_loop, inputs, ImplicitCast::None, OutputType::Unknown, 0); return new_node; } torch::jit::Node *createEndForLoop(torch::jit::Graph *graph, torch::jit::Value *outputs, torch::jit::Value *inputs, std::int64_t trip_count) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::end_for_loop, {outputs, inputs}); new_node->i_(c10::Symbol::attr("trip_count"), trip_count); const std::size_t num_outputs = outputs->node()->inputs().size(); new_node->i_(c10::Symbol::attr("num_outputs"), num_outputs); return new_node; } torch::jit::Node *createStartIfBlock(torch::jit::Graph *graph, torch::jit::Value *condition) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::poptorch::start_if_block, condition, ImplicitCast::None, OutputType::Unknown, 0); return new_node; } torch::jit::Node *createStartElseBlock(torch::jit::Graph *graph, torch::jit::Value *outputs_then) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::start_else_block, outputs_then, ImplicitCast::None, OutputType::Unknown, 0); return new_node; } torch::jit::Node *createEndIfBlock(torch::jit::Graph *graph, torch::jit::Value *outputs_else, torch::jit::Value *condition) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::end_if_block, {outputs_else, condition}); const std::size_t num_outputs = outputs_else->node()->inputs().size(); new_node->i_(c10::Symbol::attr("num_outputs"), num_outputs); return new_node; } torch::jit::Node * createRandomNormal(torch::jit::Graph *graph, const std::vector &possible_inputs, const std::vector &shape, float mean, float scale, at::ScalarType dataType) { torch::jit::Node *new_node = createAndInsertNode( graph, 
symbols::poptorch::random_normal, possible_inputs, ImplicitCast::All, OutputType::AsDtypeOrAsPromoted, 1, dataType); new_node->is_(c10::attr::shape, shape); new_node->f_(c10::attr::mean, mean); new_node->f_(c10::attr::scale, scale); // At this point, the input is no longer needed for (size_t i = 0; i < possible_inputs.size(); i++) { new_node->removeInput(0); // input 1 and input 0 } return new_node; } torch::jit::Node *createRandomUniform(torch::jit::Graph *graph, torch::jit::Value *possible_input, const std::vector &shape, float high, float low, at::ScalarType dataType) { std::vector inputs; if (possible_input != nullptr) { inputs.push_back(possible_input); } torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::random_uniform, inputs, ImplicitCast::None, OutputType::AsDtypeOrAsPromoted, 1, dataType); new_node->is_(c10::attr::shape, shape); new_node->f_(c10::attr::high, high); new_node->f_(c10::attr::low, low); // At this point, the input is no longer needed if (possible_input != nullptr) { new_node->removeInput(0); } return new_node; } torch::jit::Node *createCallCpuOp(torch::jit::Graph *graph, const std::vector &value, const std::string &id, torch::jit::Node *original_node) { const std::uint32_t num_outputs = original_node->outputs().size(); torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::canonicalised_cpu_call, {value}, ImplicitCast::None, OutputType::AsDtypeOrAsPromoted, num_outputs); new_node->s_(c10::Symbol::attr("ID"), id); for (std::uint32_t i = 0; i < num_outputs; ++i) { torch::jit::Value *old_out = original_node->output(i); torch::jit::Value *new_out = new_node->output(i); new_out->copyMetadata(old_out); } return new_node; } torch::jit::Node *createSetAvailableMemory(torch::jit::Graph *graph, torch::jit::Value *value, float proportion) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::set_available_memory, value); new_node->f_(c10::Symbol::attr("availableMemoryProportion"), 
proportion); new_node->output()->setType(value->type()); return new_node; } torch::jit::Node *createSetAttribute(torch::jit::Graph *graph, const std::string &attribute, const std::string &key, const std::string &value, bool insert_after_insertion_pnt) { torch::jit::Node *new_node = nullptr; if (insert_after_insertion_pnt) { new_node = createNode(graph, symbols::poptorch::set_attribute, {}, ImplicitCast::None, OutputType::Unknown, 0); insertNodeAfterNode(new_node, graph->insertPoint()); } else { new_node = createAndInsertNode(graph, symbols::poptorch::set_attribute, {}, ImplicitCast::None, OutputType::Unknown, 0); } new_node->s_(c10::Symbol::attr("attribute"), attribute); new_node->s_(c10::Symbol::attr("key"), key); new_node->s_(c10::Symbol::attr("value"), value); return new_node; } torch::jit::Node *createClearAttribute(torch::jit::Graph *graph, const std::string &attribute, const std::string &key, bool insert_after_insertion_pnt) { torch::jit::Node *new_node = nullptr; if (insert_after_insertion_pnt) { new_node = createNode(graph, symbols::poptorch::clear_attribute, {}, ImplicitCast::None, OutputType::Unknown, 0); insertNodeAfterNode(new_node, graph->insertPoint()); } else { new_node = createAndInsertNode(graph, symbols::poptorch::clear_attribute, {}, ImplicitCast::None, OutputType::Unknown, 0); } new_node->s_(c10::Symbol::attr("attribute"), attribute); new_node->s_(c10::Symbol::attr("key"), key); return new_node; } torch::jit::Node *createSetMatMulSerialization(torch::jit::Graph *graph, torch::jit::Value *matmul, const std::string &mode, int64_t factor, bool keep_precision) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::set_matmul_serialization, {matmul}); new_node->s_(c10::Symbol::attr("mode"), mode); new_node->i_(c10::Symbol::attr("factor"), factor); new_node->i_( c10::Symbol::attr("keep_precision"), static_cast(keep_precision)); new_node->output()->setType(matmul->type()); return new_node; } torch::jit::Node 
*createBeginIpuBlock(torch::jit::Graph *graph, std::uint64_t stage_id, std::int64_t phase, std::int64_t ipu_id) { torch::jit::Node *new_node = createAndInsertNode( graph, c10::Symbol::fromQualString("poptorch::begin_ipu_block"), {}, ImplicitCast::None, OutputType::Unknown, 0); new_node->i_(c10::Symbol::attr("stage"), stage_id); new_node->i_(c10::Symbol::attr("phase"), phase); new_node->i_(c10::Symbol::attr("ipu"), ipu_id); return new_node; } torch::jit::Node * createOptimizerGroup(torch::jit::Graph *graph, std::uint64_t group, const std::vector &list_of_params) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::optimizer_group, list_of_params, ImplicitCast::None, OutputType::Unknown, 0); new_node->i_(c10::Symbol::attr("group"), group); return new_node; } torch::jit::Node *createRecomputationCheckpoint(torch::jit::Graph *graph, torch::jit::Value *value) { return createAndInsertNode(graph, symbols::poptorch::recomputation_checkpoint, {value}, ImplicitCast::None, OutputType::AsFirstInput); } torch::jit::Node *createUnfold(torch::jit::Graph *graph, torch::jit::Value *value, int64_t dimension, int64_t size, int64_t step) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::poptorch::unfold, {value}, ImplicitCast::None, OutputType::AsFirstInput); new_node->i_(c10::Symbol::fromQualString("attr::dimension"), dimension); new_node->i_(c10::Symbol::fromQualString("attr::size"), size); new_node->i_(c10::Symbol::fromQualString("attr::step"), step); return new_node; } torch::jit::Node *createMultiConvPart(torch::jit::Graph *graph, torch::jit::Node *conv_node) { ERROR_ON_MSG(conv_node->kind() != symbols::popart::conv, "Can only create multi_conv_part from conv node"); torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::multi_conv_part, conv_node->inputs(), ImplicitCast::All, OutputType::AsImplicitCastPromoted); new_node = new_node->copyAttributes(*conv_node); new_node->output()->setType(conv_node->output()->type()); 
return new_node; } torch::jit::Node *createGru(torch::jit::Graph *graph, const std::vector &args, int64_t hidden_size) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::gru, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted, 2); new_node->i_(c10::attr::hidden_size, hidden_size); return new_node; } torch::jit::Node *createRnn(torch::jit::Graph *graph, const std::vector &args, const std::vector &activations) { torch::jit::Node *new_node = createAndInsertNode( graph, symbols::poptorch::rnn, args, ImplicitCast::All, OutputType::AsImplicitCastPromoted, 2); new_node->ss_(c10::Symbol::attr("activations"), activations); return new_node; } torch::jit::Node *createPrelu(torch::jit::Graph *graph, torch::jit::Value *self, torch::jit::Value *weight) { torch::jit::Node *new_node = createAndInsertNode(graph, symbols::poptorch::prelu, {self, weight}, ImplicitCast::None, OutputType::AsFirstInput); return new_node; } /* * Auto generated operation. */ #include "CompilerOps.cpp.inc" } // namespace poptorch ================================================ FILE: poptorch/source/OverlappedIO.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#include #include #include "PoptorchSymbols.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OverlappedIO.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { void attributiseOverlappedInputs( torch::jit::Graph *graph, std::set *to_erase_output_and_delete, std::vector *to_delete) { logging::LogContext const ctx("attributiseOverlappedInputs"); int64_t input_num = -1; for (auto *input : graph->inputs()) { if (isParameter(input)) { continue; } input_num++; auto input_uses = input->uses(); if (input_uses.empty()) { continue; } auto &user(input_uses[0].user); if ((input_uses.size() == 1) && (user->kind() == poptorch::symbols::poptorch::set_overlap_for_input)) { auto *value_node = user->input(1)->node(); ERROR_ON(value_node->kind() != c10::prim::Constant); const auto &value_str = value_node->s(c10::attr::value); graph->param_node()->s_(getOverlapSymbol("input", input_num), value_str); to_delete->push_back(user); user->removeInput(1); // String constant may be shared if (value_node->output()->uses().empty()) { to_erase_output_and_delete->insert(value_node); } user->output()->replaceAllUsesWith(input); continue; } // This should be the only op for (const auto &other_use : input_uses) { ERROR_ON_MSG( other_use.user->kind() == poptorch::symbols::poptorch::set_overlap_for_input, "poptorch.set_overlap_for_input must be the only op applied to an " "input. 
This is not the case for input " << input->debugName() << " to the model."); } } } void errorOnDoubleReturnOfOutput(torch::jit::Node *node) { logging::LogContext const ctx("check double return of" + nodeToString(node)); uint32_t return_count = 0; std::function count_returns; count_returns = [&count_returns, &return_count](torch::jit::Value *input_value) { for (auto use : input_value->uses()) { if (use.user->kind() == poptorch::symbols::poptorch::set_overlap_for_output || use.user->kind() == c10::prim::ListConstruct || use.user->kind() == c10::prim::TupleConstruct) { count_returns(use.user->output()); } else if (use.user->kind() == c10::prim::Return) { return_count++; } } }; count_returns(node->input(0)); ERROR_ON(return_count == 0); ERROR_ON_MSG( return_count > 1, "poptorch.set_overlap_for_output cannot be " "used with a tensor that is returned twice. Please check all returned " "tensors including those nested in tuples/lists."); } void attributiseOverlappedOutputs( torch::jit::Graph *graph, std::set *to_erase_output_and_delete, std::vector *to_delete) { logging::LogContext const ctx("attributiseOverlappedOutputs"); int64_t output_num = 0; std::function process_node; process_node = [&process_node, graph, &output_num, to_erase_output_and_delete, to_delete](torch::jit::Node *node) { auto overlap_symbol = getOverlapSymbol("output", output_num); if (node->kind() == poptorch::symbols::poptorch::set_overlap_for_output) { errorOnDoubleReturnOfOutput(node); auto *value_node = node->input(1)->node(); ERROR_ON(value_node->kind() != c10::prim::Constant); const auto &value_str = value_node->s(c10::attr::value); graph->return_node()->s_(overlap_symbol, value_str); to_delete->push_back(node); node->removeInput(1); // String constant may be shared if (value_node->output()->uses().empty()) { to_erase_output_and_delete->insert(value_node); } node->output()->replaceAllUsesWith(node->input(0)); output_num++; } else if (node->kind() == c10::prim::ListConstruct || node->kind() == 
c10::prim::TupleConstruct) { for (auto *input : node->inputs()) { process_node(input->node()); } } else { const std::string value_str = "no_overlap"; graph->return_node()->s_(overlap_symbol, value_str); output_num++; } }; // Loop over all graph (there may always only be one as multiple inputs are // returned as a tuple/list) for (auto *output : graph->outputs()) { process_node(output->node()); } } } // namespace void attributiseOverlappedIO(torch::jit::Graph *graph) { std::set to_erase_output_and_delete; std::vector to_delete; attributiseOverlappedInputs(graph, &to_erase_output_and_delete, &to_delete); attributiseOverlappedOutputs(graph, &to_erase_output_and_delete, &to_delete); for (auto *node : to_erase_output_and_delete) { node->eraseOutput(0); node->destroy(); } for (torch::jit::Node *node : to_delete) { node->destroy(); } // Any other use of set_overlap_for_input or set_overlap_for_input is invalid for (auto *node : graph->nodes()) { ERROR_ON_MSG(node->kind() == poptorch::symbols::poptorch::set_overlap_for_input, "poptorch.set_overlap_for_input applied on a node which is " "not a tensor input to the model."); ERROR_ON_MSG(node->kind() == poptorch::symbols::poptorch::set_overlap_for_output, "poptorch.set_overlap_for_output applied on a node which is " "not a tensor output to the model."); } } } // namespace poptorch ================================================ FILE: poptorch/source/PopartCanonicalization.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include #include #include #include #include #include "PoptorchSymbols.hpp" #include "popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/InplaceOps.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace torch { namespace jit { bool isInplaceOp(const Node *node); } // namespace jit } // namespace torch namespace poptorch { namespace { struct ReplaceInfo { bool allow_original_input_modifications; torch::jit::Value *original_input; torch::jit::Value *modified_input; }; // In-place modification of slices is a special case. When we // modify a slice in-place, torch produces a graph like the // following: // // %x = input, shape = [4, 4] // %1 = slice(%x), shape = [2, 2] // %2 = add(%1, %1) // %3 = slice(%x), shape = [2, 2] // %4 = copy_(%3, %2), shape = [2, 2] // return %x, shape = [4, 4] // // The original input %x is returned because the slice %3 is a // view on %x, and thus any modifications to %3 are reflected // in %x. 
To simulate in-place modification to slices, we return // a dynamic update instead, so that we can perform the slice // modification out-of-place, and return the "modified" tensor // with the correct shape // // %x = input, shape = [4, 4] // %1 = slice(%x), shape = [2, 2] // %2 = add(%1, %1) // %3 = dynamic_update(%x, %2) shape = [4, 4] // return %3, shape = [4, 4] // torch::jit::Node * handleSliceModification(torch::jit::Graph *graph, torch::jit::Node *node, torch::jit::Value *modified_slice, std::vector *replace_infos) { torch::jit::Value *input = node->input(0); torch::jit::Node *new_node = modified_slice->node(); bool replace_infos_allow_input_modification = false; // Follow the chain of slices that are being operated on by the inplace op while (input->node()->kind() == symbols::popart::slice || input->node()->kind() == symbols::popart::reshape_static_shape) { // skip reshape_static_shape and continue scanning for slice op, example IR // handled in this way: // %1 = slice(%x) // %2 = popart::reshape_static_shape(%1) // %3 = slice(%2) // %4 = popart::reshape_static_shape(%3) // in addition, in such case original_input in replace_infos is allowed to // be modified during replace_infos processing if (input->node()->kind() == symbols::popart::reshape_static_shape) { input = input->node()->input(0); replace_infos_allow_input_modification = true; continue; } auto *slice = input->node(); auto *slice_input = slice->input(0); // Record the indices that we sliced: We need these for DynamicUpdate std::vector slice_starts = slice->is(c10::Symbol::attr("starts")); std::vector slice_ends = slice->is(c10::Symbol::attr("ends")); const std::vector slice_dims = slice->is(c10::Symbol::attr("axes")); auto *slice_offset = createConstantInt(graph, slice_starts, {static_cast(slice_starts.size())}) ->output(); std::vector sizes(slice_starts.size()); std::transform(std::begin(slice_ends), std::end(slice_ends), std::begin(slice_starts), std::begin(sizes), std::minus()); auto 
*dynamic_update = createDynamicupdate(graph, {slice_input, slice_offset, modified_slice}, slice_dims, sizes, /* noOverlap = */ 1); // Save the slice input and the result of the dynamic update // (i.e. the modified tensor) so that we can replace the original // inputs after PopART canonicalisation has taken place auto *modified_input = dynamic_update->output(); replace_infos->push_back( {replace_infos_allow_input_modification, slice_input, modified_input}); new_node = dynamic_update; // Repeat this process for the entire chain of slices - the // reconstructed modified input is used to reconstruct the next // modified slice input input = slice_input; modified_slice = modified_input; } // Dynamic update does not support step size. Slicing with step size is // implemented using subsample(slice(x)) if (input->node()->kind() == symbols::popart::subsample) { auto *subsample = input->node(); if (subsample->input(0)->node()->kind() == symbols::popart::slice) { ERROR("In-place modification of slices with step size other than 1 is " "not supported."); } } return new_node; } // Propagates half types across lists (tuple set to false) or tuples (tuple set // to true). // If the new node is a List/TupleConstruct, it will not, by default, have the // types set to half when they should be, because tracing is always performed // with floats. Use this function to rememby that on a List/Tuple construct // after it has been created. void propagateHalfOnListOrTupleConstruct(torch::jit::Node *n, bool tuple) { auto constr_type = tuple ? at::prim::TupleConstruct : at::prim::ListConstruct; auto unpack_type = tuple ? 
at::prim::TupleUnpack : at::prim::ListUnpack; if (n->kind() != constr_type) { return; } // Record which inputs were half: they would not have been on tracing but // would be change during canonicalization std::vector input_was_half; input_was_half.reserve(n->inputs().size()); for (auto *input : n->inputs()) { // Skip if it is not a tensor or has no scalar type auto tensor_type = input->type()->cast(); if ((!tensor_type) || !tensor_type->scalarType()) { input_was_half.emplace_back(false); continue; } input_was_half.emplace_back(getNodeScalarType(input) == at::ScalarType::Half); } // Propagate types on the unpack node(s) for (const auto &use : n->output()->uses()) { torch::jit::Node *unpack = use.user; if (unpack->kind() != unpack_type) { continue; } size_t idx = 0; for (auto *output : unpack->outputs()) { // The output will be float as tracing was carried out using floats. if (input_was_half[idx]) { output->setType( output->type()->expect()->withScalarType( c10::ScalarType::Half)); } idx++; } } } class CanonicalizeImpl { public: static void run(torch::jit::Graph *graph); }; /* * ConvertAtenToPopart implementation. 
*/ void CanonicalizeImpl::run(torch::jit::Graph *graph) { logging::LogContext const ctx_func("PopartCanonicalization"); std::vector replace_infos; for (torch::jit::Node *node : graph->nodes()) { logging::LogContext const ctx("processing " + nodeToString(node)); const WithNodeMetadata metadata(node); torch::jit::WithInsertPoint const insert_point(node); torch::jit::Node *new_node = nullptr; torch::jit::Symbol const kind = node->kind(); if (const SymbolHandler handler = getHandler(kind)) { new_node = handler(graph, node); const bool was_inplace_op_on_view = node->hasAttributeS("was_inplace_on_view") && node->i(c10::Symbol::attr("was_inplace_on_view")) == 1; if (was_inplace_op_on_view || torch::jit::isInplaceOp(node)) { new_node = handleSliceModification(graph, node, new_node->output(), &replace_infos); } } // If we have a new node add it and replace the old use. if (new_node != nullptr) { // Mark this node for deletion. markNodeForDeletion(node); if (node->hasUses()) { for (std::uint64_t i = 0; i < node->outputs().size(); ++i) { if (i >= new_node->outputs().size()) { ERROR_ON_MSG( node->output(i)->hasUses(), "The canonicalised JIT node has fewer outputs than the " "dispatch function. This is only an issue because these " "outputs are used."); continue; } // As well as replacing the use, this will copy across shape/type // if not explicitly set. 
replaceOutputUse(node, new_node, i); } // Propagate half types across ListConstructs and TupleConstructs propagateHalfOnListOrTupleConstruct(new_node, true); propagateHalfOnListOrTupleConstruct(new_node, false); } } } // Replace slice inputs with their modified counterparts for (auto curr_info_iter = replace_infos.begin(); curr_info_iter != replace_infos.end(); ++curr_info_iter) { curr_info_iter->original_input->replaceAllUsesAfterNodeWith( curr_info_iter->modified_input->node(), curr_info_iter->modified_input); for (auto next_info_iter = curr_info_iter + 1; next_info_iter != replace_infos.end(); ++next_info_iter) { // if original input modification is allowed this code will modify // subsequent replace infos if original inputs are the same. // // example: // replace_infos[0] = {x, // don't care // original_input = %1, // modified_input = %2} // replace_infos[1] = {false, // original_input = %1, // modified_input = %3} // replace_infos[2] = {true, // original_input = %1, // modified_input = %4} // // >>>> will modify replace info struct at index 2: // // replace_infos[1] = {false, // original_input = %1, // modified_input = %3} // replace_infos[2] = {true, // original_input = %2, <-- was %1 // modified_input = %4} if (next_info_iter->allow_original_input_modifications && curr_info_iter->original_input == next_info_iter->original_input) { next_info_iter->original_input = curr_info_iter->modified_input; } } } // Build a list of nodes marked for deletion. std::unordered_set to_delete; for (torch::jit::Node *node : graph->nodes()) { if (isMarkedForDeletion(node)) { to_delete.insert(node); } } // Remove the dead nodes. 
searchAndPossiblyDestroy(to_delete); } } // namespace void canonicalize(torch::jit::Graph *graph) { const CanonicalizeImpl converter; converter.run(graph); } } // namespace poptorch ================================================ FILE: poptorch/source/PopartLateCanonicalization.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include "PoptorchSymbols.hpp" #include "popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { using FunctionTy = std::function; // broadcast the scalar option vector to match num_convs template void broadcast(std::vector &option, size_t num_convs) { if (option.size() != 1 || num_convs == 1) { return; } option.insert(option.end(), num_convs - 1, option[0]); } class MultiConvHandler { public: explicit MultiConvHandler(torch::jit::Graph *g) : _graph(g) {} bool inMultiConv() const { return _in_multi_conv; } void begin(torch::jit::Node *node) { ERROR_ON_MSG(inMultiConv(), "Nested poptorch.MultiConv is not supported."); _in_multi_conv = true; _to_delete.insert(node); } void part(torch::jit::Node *node) { _parts.push_back(node); } FunctionTy end(torch::jit::Node *node) { ERROR_ON_MSG(!inMultiConv() || _parts.empty(), "Unexpected end_multi_conv, is the IR malformed?"); _in_multi_conv = false; applyOptions(node); _parts_queue.push(_parts); _parts.clear(); return [this, node]() { applyPartLinks(node); }; } void cleanup() { searchAndPossiblyDestroy(_to_delete); } private: void applyOptions(torch::jit::Node *end_node) { // Fold any supplied options as attributes of the end_node. // Mark all options for deletion when we cleanup the IR. 
// available_memory_proportions torch::jit::Node *available_mem_props = end_node->input(0)->node(); _to_delete.insert(available_mem_props); if (!isNone(available_mem_props)) { std::vector vals = constantListToVec(available_mem_props, constantToFloat); broadcast(vals, _parts.size()); end_node->fs_(c10::Symbol::attr("available_memory_proportions"), vals); } // partials_types torch::jit::Node *partials_types = end_node->input(1)->node(); _to_delete.insert(partials_types); if (!isNone(partials_types)) { std::vector vals = constantToLongVec(partials_types); broadcast(vals, _parts.size()); end_node->is_(c10::Symbol::attr("partials_types"), vals); } // plan_type torch::jit::Node *plan_type = end_node->input(2)->node(); _to_delete.insert(plan_type); if (!isNone(plan_type)) { end_node->i_(c10::Symbol::attr("plan_type"), constantToLong(plan_type)); } // per_conv_reserved_tiles torch::jit::Node *reserved_tiles = end_node->input(3)->node(); _to_delete.insert(reserved_tiles); if (!isNone(reserved_tiles)) { end_node->i_(c10::Symbol::attr("per_conv_reserved_tiles"), constantToLong(reserved_tiles)); } // cycle_back_off torch::jit::Node *back_off = end_node->input(4)->node(); _to_delete.insert(back_off); if (!isNone(back_off)) { end_node->f_(c10::Symbol::attr("cycle_back_off"), constantToFloat(back_off)); } // enable_conv_dithering torch::jit::Node *enable_conv_ditherings = end_node->input(5)->node(); _to_delete.insert(enable_conv_ditherings); if (!isNone(enable_conv_ditherings)) { std::vector vals = constantToLongVec(enable_conv_ditherings); broadcast(vals, _parts.size()); end_node->is_(c10::Symbol::attr("enable_conv_ditherings"), vals); } // Clear all the options from the end node inputs as they are now // incorporated as node attributes end_node->removeAllInputs(); } void applyPartLinks(torch::jit::Node *end_node) { // Swaps out conv nodes with multi_conv_part which are then linked to the // end_node. Each conv output flows through the end_multi_conv instruction. 
uint64_t num_outputs = 0; // Track the earliest user for the multiconv outputs torch::jit::Node *earliest_user = nullptr; for (torch::jit::Node *node : _parts_queue.front()) { // Create the multi_conv_part node and insert it after the original conv WithNodeMetadata meta(node); torch::jit::Node *conv_part = createMultiConvPart(_graph, node); conv_part->moveAfter(node); _to_delete.insert(node); // Attach the multi_conv_part to the end_multi_conv instruction. end_node->addInput(conv_part->output()); torch::jit::Value *output_i = end_node->addOutput(); output_i->setType(conv_part->output()->type()); replaceOutputUse(node->output(), end_node->output(num_outputs)); // Keep track of the first node that consumes the multiconv outputs torch::jit::Node *output_user = findEarliestUser(output_i); if ((earliest_user == nullptr) || earliest_user->isAfter(output_user)) { earliest_user = output_user; } num_outputs++; } _parts_queue.pop(); if (end_node->isBefore(earliest_user)) { // All good, nothing further to do here return; } // Move the end_multi_conv instruction directly before its first consumer // and check for any dependency violations that might have been introduced. 
end_node->moveBefore(earliest_user); torch::jit::node_list checklist{end_node}; while (!checklist.empty()) { torch::jit::Node *consumer = checklist.back(); checklist.pop_back(); for (torch::jit::Value *value : consumer->inputs()) { torch::jit::Node *producer = value->node(); // Fix any topological ordering violations and check any moved nodes if (producer->isAfter(consumer)) { producer->moveBefore(consumer); checklist.push_back(producer); } } } } torch::jit::Graph *_graph; std::unordered_set _to_delete; torch::jit::node_list _parts; std::queue _parts_queue; bool _in_multi_conv = false; }; // Reorders set_matmul_serialization and reshape if required FunctionTy reorderMatmulSeralisationIfRequired(torch::jit::Node *node) { return [node]() { ERROR_ON(node->kind() != symbols::poptorch::set_matmul_serialization); auto *reshape_node = node->input()->node(); // If the input is a matmul, no reordering is necessary if (reshape_node->kind() == symbols::popart::matmul) { return; } ERROR_ON(reshape_node->kind() != symbols::popart::reshape_static_shape); ERROR_ON(reshape_node->input()->node()->kind() != symbols::popart::matmul); // Change matmul -> reshape -> set_matmul_seralization // to matmul -> set_matmul_seralization -> reshape node->moveBefore(reshape_node); node->replaceInput(0, reshape_node->input()); // matmul -> reshape // \-> set_matmul_seralization -> ... node->output()->replaceAllUsesWith(reshape_node->output()); // matmul -> reshape -> ... // \-> set_matmul_seralization reshape_node->replaceInput(0, node->output()); // matmul -> set_matmul_seralization -> reshape -> ... }; } void canonicalizeLate(torch::jit::Graph *graph) { logging::LogContext ctx_func("canonicalizeLate"); /* * Perform the operation by looking for nodes we know need to be patched and * add the patching code to the callback which then all get called at once. * (To perserve the iterators.) */ std::vector callbacks; MultiConvHandler multi_conv_handler(graph); // Look for the nodes. 
for (torch::jit::Node *node : graph->nodes()) { logging::LogContext ctx("Processing " + nodeToString(node)); const torch::jit::Symbol kind = node->kind(); if (kind == symbols::poptorch::begin_multi_conv) { multi_conv_handler.begin(node); } else if (multi_conv_handler.inMultiConv() && kind == symbols::popart::conv) { multi_conv_handler.part(node); } else if (kind == symbols::poptorch::end_multi_conv) { callbacks.emplace_back(multi_conv_handler.end(node)); } else if (kind == symbols::poptorch::push_name_scope) { std::string name = constantToString(node->input(0)->node()); node->s_(c10::Symbol::attr("name"), name); // Remove inputs converted to attributes. callbacks.emplace_back( [node]() { removeAndPossiblyDestroyAllInputs(node); }); } else if (kind == symbols::poptorch::set_attribute) { if (node->inputs().empty()) { continue; } std::string attribute = constantToString(node->input(0)->node()); std::string key = constantToString(node->input(1)->node()); std::string value = constantToString(node->input(2)->node()); // Remove inputs converted to attributes. callbacks.emplace_back( [node]() { removeAndPossiblyDestroyAllInputs(node); }); node->s_(c10::Symbol::attr("attribute"), attribute); node->s_(c10::Symbol::attr("key"), key); node->s_(c10::Symbol::attr("value"), value); } else if (kind == symbols::poptorch::clear_attribute) { if (node->inputs().empty()) { continue; } std::string attribute = constantToString(node->input(0)->node()); std::string key = constantToString(node->input(1)->node()); node->s_(c10::Symbol::attr("attribute"), attribute); node->s_(c10::Symbol::attr("key"), key); // Remove inputs converted to attributes. 
callbacks.emplace_back( [node]() { removeAndPossiblyDestroyAllInputs(node); }); } else if (kind == symbols::poptorch::set_matmul_serialization) { callbacks.emplace_back(reorderMatmulSeralisationIfRequired(node)); } else if (kind == symbols::poptorch::set_available_memory) { callbacks.emplace_back( [node]() { moveSetAvailableMemoryIfRequired(node); }); } } // Execute the patchups. for (auto &callback : callbacks) { callback(); } multi_conv_handler.cleanup(); } } // namespace poptorch ================================================ FILE: poptorch/source/PoplarExecutable.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include #include "poptorch/DispatchTracer.hpp" #include "poptorch/InplaceOps.hpp" #include "poptorch/PoplarExecutable.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { void PoplarExecutable::updateOptimizers( const std::vector &optimizers) { _compiler.updateOptimizers(optimizers); } std::vector PoplarExecutable::run(std::vector &inTensors) { const std::vector tensor_views; // Set up the input tensors in the poplar graph to point to the incoming // pytorch tensors. for (std::size_t i = 0; i < _popart_inputs.size(); ++i) { popart_compiler::TensorId const popart_id = _popart_inputs[i]; const at::Tensor &pytorch_tensor = inTensors.at(i); ERROR_ON(!pytorch_tensor.is_contiguous()); // Convert to correct data type. 
std::vector popart_dims(pytorch_tensor.sizes().size()); std::transform(pytorch_tensor.sizes().begin(), pytorch_tensor.sizes().end(), popart_dims.begin(), [](std::int64_t j) { return j; }); // Handle input based on the PyTorch input type at::ScalarType const elem_type = pytorch_tensor.scalar_type(); void *data_ptr = nullptr; if (pytorch_tensor.is_cpu()) { data_ptr = pytorch_tensor.data_ptr(); } else { data_ptr = getDataSource(pytorch_tensor); } ERROR_ON(data_ptr == nullptr); switch (elem_type) { case at::ScalarType::Byte: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Char: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Float: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Half: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims, true); break; case at::ScalarType::Short: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Int: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Bool: _compiler.setUpInputOp(popart_id, static_cast(data_ptr), popart_dims); break; case at::ScalarType::Long: // If it's an IPU tensor then it should have been handled by the // dispatcher. ERROR_ON_MSG(!pytorch_tensor.is_cpu(), "Only supported for CPU tensors"); _converted_inputs[i] = pytorch_tensor.toType(at::ScalarType::Int); _compiler.setUpInputOp( popart_id, static_cast(_converted_inputs[i].data_ptr()), popart_dims); break; case at::ScalarType::Double: case at::ScalarType::BFloat16: // If it's an IPU tensor then it should have been handled by the // dispatcher. 
ERROR_ON_MSG(!pytorch_tensor.is_cpu(), "Only supported for CPU tensors"); _converted_inputs[i] = pytorch_tensor.toType(at::ScalarType::Float); _compiler.setUpInputOp( popart_id, static_cast(_converted_inputs[i].data_ptr()), popart_dims); break; default: ERROR("Unsupported input type torch." << c10::toString(elem_type)); } } // Temp buffers for the output state. std::vector returnees; returnees.reserve(_popart_outputs.size()); // Set up the outputs. for (size_t i = 0; i < _popart_outputs.size(); i++) { const popart_compiler::TensorId &popart_id(_popart_outputs[i]); auto dims = _compiler.getSize(popart_id); ERROR_ON_MSG(dims == popart_compiler::Compiler::invalid_size, "Shape inference failed"); std::uint64_t const b_dim = _compiler.popartBatchDimForAnchor(popart_id); if (b_dim > 1) { // Treat scalars as 1D tensors if necessary for batching. if (dims.empty()) { dims.push_back(1); } // Adjust by the popart batch dim, accounting for the anchor. dims[0] *= b_dim; } // Create the torch tensor and use its memory for the popart tensor. at::ScalarType const type = _popart_output_types[i]; returnees.emplace_back(at::empty( {dims}, at::dtype(type).memory_format(c10::MemoryFormat::Contiguous))); auto *data_ptr = returnees.back().toTensor().data_ptr(); switch (type) { case at::ScalarType::Byte: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; case at::ScalarType::Char: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; case at::ScalarType::Float: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; case at::ScalarType::Half: case at::ScalarType::Short: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; case at::ScalarType::Int: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; case at::ScalarType::Bool: _compiler.setUpOutputOp(popart_id, static_cast(data_ptr), dims); break; default: ERROR("Unexpected type returned from popart"); } } // Execute the compiled poplar graph. 
_compiler.run(); const auto &mapping = _inplace_info.input_output_mapping; for (size_t i = 0; i < mapping.size(); i++) { if (mapping[i] == InplaceGraphInfo::no_mapping) { continue; } auto out_tensor = returnees.at(mapping[i]).toTensor(); inTensors.at(i).copy_(out_tensor, false); } returnees.resize(_inplace_info.num_tensor_outputs); return returnees; } void PoplarExecutable::loadEngineAndConnectStreams() { if (!_compiler.isAttachedToDevice()) { _compiler.attachToDevice(); } _compiler.loadEngineAndConnectStreams(); } // Tell popart to copy weights off the IPU and write into host memory. void PoplarExecutable::copyWeightsToHost( const std::map &buffers) { std::vector pointers; pointers.reserve(_parameter_names.size()); for (const std::string &name : _parameter_names) { pointers.push_back(buffers.at(name)); } _compiler.copyWeightsToHost(pointers); } // Tell popart to copy weights from host into IPU memory. void PoplarExecutable::copyWeightsToDevice( const std::map &buffers) { std::vector pointers; pointers.reserve(_parameter_names.size()); for (const std::string &name : _parameter_names) { pointers.push_back(buffers.at(name)); } _compiler.copyWeightsToDevice(pointers); } // Tell popart to copy named buffers from host into IPU memory. void PoplarExecutable::copyNamedBuffersToDevice( const std::map &buffers) { std::vector pointers; pointers.reserve(buffers.size()); for (const auto &buffer : buffers) { pointers.push_back(buffer.second); } _compiler.copyNamedBuffersToDevice(pointers); } const std::vector & PoplarExecutable::outputTypes() const { return _compiler.outputTypes(); } std::string PoplarExecutable::getPopartIR() const { auto managed_ptr = _compiler.getPopartIR(); const char *raw_ptr = static_cast(managed_ptr.get()); // Convert to std::string, copying again. 
return raw_ptr; } std::set PoplarExecutable::getTensorNames() const { std::set casted_ids; const auto tensor_ids = _compiler.getTensorNames(); for (const auto &tensor_id : tensor_ids) { const char *raw_ptr = static_cast(tensor_id.get()); // Convert to std::string, copying again. casted_ids.insert(raw_ptr); } return casted_ids; } void PoplarExecutable::detachFromDevice() { _compiler.detachFromDevice(); } void PoplarExecutable::attachToDevice() { _compiler.attachToDevice(); } bool PoplarExecutable::isAttachedToDevice() const { return _compiler.isAttachedToDevice(); } } // namespace poptorch ================================================ FILE: poptorch/source/PoptorchStaticInit.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef SOURCE_POPTORCH_STATIC_INIT_H #define SOURCE_POPTORCH_STATIC_INIT_H // The constants below set priorities for constructor functions used to // initialize static data. Functions with lower numbers run first. // Priority value for symbol initialisation functions #define SYMBOL_INIT_PRIORITY 101 // Priority value for shape inference registration functions #define SHAPE_INFERENCE_INIT_PRIORITY 102 // Priority value for handler registration functions #define HANDLER_INIT_PRIORITY 103 #endif ================================================ FILE: poptorch/source/PoptorchSymbols.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. 
All rights reserved #include #include #include "PoptorchStaticInit.hpp" #include "PoptorchSymbols.hpp" #include "poptorch_logging/Logging.hpp" #define SYMBOL_INIT(Namespace, Name) \ Name = c10::Symbol::fromQualString(#Namespace "::" #Name) namespace c10::aten { c10::Symbol multilabel_soft_margin_loss; // NOLINT // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void initializeAtenSymbols() { // clang-format on poptorch::logging::trace("Initializing aten symbols"); SYMBOL_INIT(aten, multilabel_soft_margin_loss); } } // namespace c10::aten namespace torch_scatter { c10::Symbol scatter_max; // NOLINT c10::Symbol scatter_min; // NOLINT c10::Symbol scatter_mul; // NOLINT // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void initializeTorchScatterSymbols() { // clang-format on poptorch::logging::trace("Initializing torch_scatter symbols"); SYMBOL_INIT(torch_scatter, scatter_max); SYMBOL_INIT(torch_scatter, scatter_min); SYMBOL_INIT(torch_scatter, scatter_mul); } } // namespace torch_scatter namespace torch_cluster { c10::Symbol grid; // NOLINT // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void initializeTorchScatterSymbols() { // clang-format on poptorch::logging::trace("Initializing torch_scatter symbols"); SYMBOL_INIT(torch_cluster, grid); } } // namespace torch_cluster namespace torch_spline_conv { c10::Symbol spline_basis; // NOLINT c10::Symbol spline_weighting; // NOLINT // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void initializeTorchSplineConvSymbols() { // clang-format on poptorch::logging::trace("Initializing torch_spline_conv symbols"); SYMBOL_INIT(torch_spline_conv, spline_basis); SYMBOL_INIT(torch_spline_conv, spline_weighting); } } // namespace torch_spline_conv namespace poptorch { namespace symbols { #define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs) \ c10::Symbol Namespace::FuncName; #define 
OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args, \ BodyArgs) \ c10::Symbol Namespace::FuncName; #include "popart_compiler/SupportedOperations.inc.hpp" // NOLINT #undef OP_DECL #undef OP_DECL_NO_RETURN // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void initializeSupportedOperations() { // clang-format on logging::trace("Initializing supported operationss"); #define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs) \ Namespace::FuncName = \ c10::Symbol::fromQualString(#Namespace "::" #FuncName); // NOLINT #define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args, \ BodyArgs) \ Namespace::FuncName = \ c10::Symbol::fromQualString(#Namespace "::" #FuncName); // NOLINT #include "popart_compiler/SupportedOperations.inc.hpp" // NOLINT #undef OP_DECL #undef OP_DECL_NO_RETURN } namespace poptorch { c10::Symbol nop; c10::Symbol dynamic_slice; c10::Symbol dynamic_update; c10::Symbol begin_ipu_block; c10::Symbol internal_cast; c10::Symbol end_ipu_block; c10::Symbol identity_loss; c10::Symbol set_available_memory; c10::Symbol set_matmul_serialization; c10::Symbol set_overlap_for_input; c10::Symbol set_overlap_for_output; c10::Symbol optimizer_group; c10::Symbol begin_multi_conv; c10::Symbol multi_conv_part; c10::Symbol end_multi_conv; c10::Symbol update_param_inplace; c10::Symbol host_side_cast; c10::Symbol start_for_loop; c10::Symbol end_for_loop; c10::Symbol start_if_block; c10::Symbol start_else_block; c10::Symbol end_if_block; c10::Symbol push_name_scope; c10::Symbol pop_name_scope; c10::Symbol add_untyped_input_tensor; c10::Symbol host_and_ipu_side_tensor_constant; c10::Symbol call_cpu_op; c10::Symbol end_cpu_op; c10::Symbol canonicalised_cpu_call; c10::Symbol ctc_beam_search_decoder; c10::Symbol set_attribute; c10::Symbol clear_attribute; c10::Symbol fps; c10::Symbol nearest; c10::Symbol nearest_batch_list; // clang-format off __attribute__((constructor(SYMBOL_INIT_PRIORITY))) static void 
initializePoptorchSymbols() { // clang-format on logging::trace("Initializing poptorch symbols"); SYMBOL_INIT(poptorch, nop); SYMBOL_INIT(poptorch, dynamic_slice); SYMBOL_INIT(poptorch, dynamic_update); SYMBOL_INIT(poptorch, begin_ipu_block); SYMBOL_INIT(poptorch, internal_cast); SYMBOL_INIT(poptorch, end_ipu_block); SYMBOL_INIT(poptorch, identity_loss); SYMBOL_INIT(poptorch, set_available_memory); SYMBOL_INIT(poptorch, set_matmul_serialization); SYMBOL_INIT(poptorch, set_overlap_for_input); SYMBOL_INIT(poptorch, set_overlap_for_output); SYMBOL_INIT(poptorch, optimizer_group); SYMBOL_INIT(poptorch, begin_multi_conv); SYMBOL_INIT(poptorch, multi_conv_part); SYMBOL_INIT(poptorch, end_multi_conv); SYMBOL_INIT(poptorch, host_side_cast); SYMBOL_INIT(poptorch, update_param_inplace); SYMBOL_INIT(poptorch, start_for_loop); SYMBOL_INIT(poptorch, end_for_loop); SYMBOL_INIT(poptorch, start_if_block); SYMBOL_INIT(poptorch, start_else_block); SYMBOL_INIT(poptorch, end_if_block); SYMBOL_INIT(poptorch, push_name_scope); SYMBOL_INIT(poptorch, pop_name_scope); SYMBOL_INIT(poptorch, add_untyped_input_tensor); SYMBOL_INIT(poptorch, host_and_ipu_side_tensor_constant); SYMBOL_INIT(poptorch, call_cpu_op); SYMBOL_INIT(poptorch, end_cpu_op); SYMBOL_INIT(poptorch, canonicalised_cpu_call); SYMBOL_INIT(poptorch, ctc_beam_search_decoder); SYMBOL_INIT(poptorch, set_attribute); SYMBOL_INIT(poptorch, clear_attribute); SYMBOL_INIT(poptorch, fps); SYMBOL_INIT(poptorch, nearest); SYMBOL_INIT(poptorch, nearest_batch_list); } } // namespace poptorch } // namespace symbols c10::Symbol getOverlapSymbol(const char *suffix, unsigned int num) { return c10::Symbol::attr( fmt::format("poptorch_overlap_for_{}{}", suffix, num)); } } // namespace poptorch ================================================ FILE: poptorch/source/PoptorchSymbols.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef SOURCE_POPTORCH_SYMBOLS_H #define SOURCE_POPTORCH_SYMBOLS_H #include #include // Create missing C10 symbols. // PyTorch initialises aten Symbols from native_functions.yml (see // `aten_interned_strings.h`, and `gen_interned_strings` in torchgen). However, // not all the aten Symbols we need are present in native_functions.yml. namespace c10::aten { extern c10::Symbol multilabel_soft_margin_loss; // NOLINT } // namespace c10::aten namespace poptorch { namespace symbols { #define OP_DECL(Namespace, FuncName, function, OnnxImpl, Args, BodyArgs) \ namespace Namespace { \ extern c10::Symbol FuncName; \ } #define OP_DECL_NO_RETURN(Namespace, FuncName, function, OnnxImpl, Args, \ BodyArgs) \ namespace Namespace { \ extern c10::Symbol FuncName; \ } #include "popart_compiler/SupportedOperations.inc.hpp" #undef OP_DECL #undef OP_DECL_NO_RETURN } // namespace symbols namespace symbols::poptorch { extern c10::Symbol nop; extern c10::Symbol dynamic_slice; extern c10::Symbol dynamic_update; extern c10::Symbol begin_ipu_block; extern c10::Symbol internal_cast; extern c10::Symbol end_ipu_block; extern c10::Symbol identity_loss; extern c10::Symbol set_available_memory; extern c10::Symbol set_matmul_serialization; extern c10::Symbol set_overlap_for_input; extern c10::Symbol set_overlap_for_output; extern c10::Symbol optimizer_group; extern c10::Symbol begin_multi_conv; extern c10::Symbol multi_conv_part; extern c10::Symbol end_multi_conv; // In order to allow a parameter/buffer to be updated in place, the only // guaranteed inplace op by PopART, use update_param_inplace. extern c10::Symbol update_param_inplace; // Casting is done before passing the input to the IPU: the op is used so that // the input types match those received from pytorch but that the input // types to later ops have the correct type.
extern c10::Symbol host_side_cast; extern c10::Symbol start_for_loop; extern c10::Symbol end_for_loop; extern c10::Symbol start_if_block; extern c10::Symbol start_else_block; extern c10::Symbol end_if_block; extern c10::Symbol push_name_scope; extern c10::Symbol pop_name_scope; extern c10::Symbol add_untyped_input_tensor; extern c10::Symbol host_and_ipu_side_tensor_constant; extern c10::Symbol call_cpu_op; extern c10::Symbol end_cpu_op; extern c10::Symbol canonicalised_cpu_call; extern c10::Symbol ctc_beam_search_decoder; extern c10::Symbol set_attribute; extern c10::Symbol clear_attribute; extern c10::Symbol unfold; extern c10::Symbol prelu; extern c10::Symbol fps; extern c10::Symbol nearest; extern c10::Symbol nearest_batch_list; } // namespace symbols::poptorch // Return the attribute symbol referring to having overlap for a given input c10::Symbol getOverlapSymbol(const char *suffix, unsigned int num); } // namespace poptorch // Define symbols used by PyG torch_scatter library namespace torch_scatter { extern c10::Symbol scatter_max; extern c10::Symbol scatter_min; extern c10::Symbol scatter_mul; } // namespace torch_scatter namespace torch_cluster { extern c10::Symbol grid; } // namespace torch_cluster namespace torch_spline_conv { extern c10::Symbol spline_basis; extern c10::Symbol spline_weighting; } // namespace torch_spline_conv #endif // SOURCE_POPTORCH_SYMBOLS_H ================================================ FILE: poptorch/source/RemoveSurplusIdentityLosses.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include "PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Logging.hpp" /* Removes losses such that the module only has one loss at the end. 1. Finds any loss in the module. 2.
Looks through the use-def chain of that loss to see if it is used in
     another loss, if so removes it.
  3. At the end there will only be one loss used.
*/
namespace poptorch {

// Returns true when any value produced by |node| is consumed, directly or
// through a chain of further users, by an identity loss node.
bool traverseUseDef(torch::jit::Node *node) {
  for (torch::jit::Value *produced : node->outputs()) {
    for (torch::jit::Use consumer : produced->uses()) {
      // A consumer which is itself a loss means |node| is used in a loss.
      const bool consumer_is_loss =
          consumer.user->kind() == symbols::popart::identityloss;

      // Keep following the chain from the consumer: uses can't be circular,
      // so the recursion terminates.
      const bool reaches_loss = traverseUseDef(consumer.user);

      // Stop at the first use which leads to a loss.
      if (consumer_is_loss || reaches_loss) {
        return true;
      }
    }
  }

  // No use of any output led to a loss.
  return false;
}

void removeSurplusIdentityLosses(torch::jit::Graph *graph) {
  std::unordered_set to_delete;

  // For diagnostics.
  std::size_t total_found_losses = 0;
  std::size_t independent_loss_count = 0;

  // For all nodes in the IR.
  for (torch::jit::Node *node : graph->nodes()) {
    WithNodeMetadata meta{node};
    const torch::jit::Symbol kind = node->kind();

    // For each loss see if it is used in a loss.
    if (kind == symbols::popart::identityloss) {
      total_found_losses++;

      bool used_in_loss = traverseUseDef(node);

      if (used_in_loss) {
        // Remove the node by replacing it with either the input or the input
        // transformed by some operation.
        torch::jit::Node *new_node = node->input()->node();

        // If the operation was performing a reduction replace it with a manual
        // reduction operation.
        const std::size_t reduction = node->i(c10::Symbol::attr("reduction"));
        if (reduction < 2) {
          // Flatten it into 1D.
          torch::jit::Node *flattened =
              createFlatten(graph, {new_node->output()}, 0);
          flattened->moveAfter(node);

          // Reduce across that 1D tensor.
if (reduction == 0) { // Sum new_node = createReducesum(graph, {flattened->output()}, {1}, 0); new_node->moveAfter(flattened); } else if (reduction == 1) { // Mean new_node = createReducemean(graph, {flattened->output()}, {1}, 0); new_node->moveAfter(flattened); } } node->replaceAllUsesWith(new_node); to_delete.insert(node); } else { independent_loss_count++; } } } logging::debug("Found {} losses and removed {}", total_found_losses, total_found_losses - independent_loss_count); ERROR_ON_MSG(total_found_losses == 0, "Couldn't find a loss in graph!"); ERROR_ON_MSG(independent_loss_count != 1, "Multiple independent losses found" " in graph. Graph must have one final loss." " Wrap final graph loss in poptorch.identity_loss."); // Remove the dead nodes. searchAndPossiblyDestroy(to_delete); } } // namespace poptorch ================================================ FILE: poptorch/source/RequiresGrad.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include #include #include "poptorch/DispatchTracer.hpp" #include "poptorch/RequiresGrad.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { void fixRequiresGradFromDispatch(torch::jit::Graph *graph) { // For each output of each node in the graph. for (auto *node : graph->nodes()) { for (auto *output : node->outputs()) { auto tensor_type = output->type()->cast(); if (!tensor_type) { continue; } auto device = tensor_type->device(); if (!device) { continue; } if (device->type() != at::DeviceType::IPU) { continue; } // If the output is an IPU floating-point tensor, check if any // of the inputs has requires_grad set, and update the Value if // needed. 
bool requires_grad = false; if (tensor_type->scalarType().has_value() && c10::isFloatingType(tensor_type->scalarType().value())) { for (auto *input : node->inputs()) { if (input->requires_grad()) { requires_grad = true; break; } } } if (requires_grad != output->requires_grad()) { logging::trace("[requires_grad] Set requires_grad={} on node {}", requires_grad, nodeToString(node)); output->setType(tensor_type->withRequiresGrad(requires_grad)); } } } } } // namespace poptorch ================================================ FILE: poptorch/source/SessionOptionsParser.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include "poptorch/SessionOptionsParser.hpp" #include "popart_compiler/Compiler.hpp" #include "popart_compiler/Utils.hpp" #include "poptorch/ImplicitCasting.hpp" #include "poptorch_logging/Tracepoint.hpp" namespace poptorch { float IPyValue::toFloatWithRangeCheck() const { // A python "float" is a double const double value = toDouble(); ERROR_ON_MSG(value > std::numeric_limits::max(), value << " is too high for a Popart float attribute."); ERROR_ON_MSG(value < std::numeric_limits::lowest(), value << " is too low for a Popart float attribute."); return static_cast(value); } std::vector IPyValue::toVectorString() const { std::vector out; out.reserve(getListSize()); forEachInList([&out](const IPyValue &val) { out.push_back(val.toString()); }); return out; } SessionOptionsParser::~SessionOptionsParser() = default; popart_compiler::SessionOptions &SessionOptionsParser::options() { return *_opts; } SessionOptionsParser::SessionOptionsParser(const IPyValue &py_opts) : _opts(std::make_unique()) { const logging::LogContext ctx_func("parseSessionOptions"); // steps, replicationFactor, profile auto &options = *_opts; py_opts.forEachInDict([&options, &py_opts](const IPyValue &name_val, const IPyValue &value) { const auto name = name_val.toString(); const logging::LogContext ctx("option: " + name); // Options 
excluded here: // - patterns_level is handled at the same time as "patterns". // - anchored_tensors is dealt with exclusively in Python. if (name == "patterns_level" || name == "anchored_tensors") { return; } if (name == "compilation_progress_bar_fn") { options.setCompilationProgressLogger(value.toFunction()); } else if (value.isBoolean()) { options.addBoolOption(name.c_str(), value.toBoolean()); } else if (value.isDouble()) { options.addDoubleOption(name.c_str(), value.toDouble()); } else if (value.isInt()) { options.addUint64Option(name.c_str(), value.toUInt64()); } else if (value.isString()) { options.addStringOption(name.c_str(), value.toString().c_str()); } else if (value.isSetListOrTuple()) { value.forEachInList([&options, &name](const IPyValue &str_opt) { options.insertStringOption(name.c_str(), str_opt.toString().c_str()); }); } else if (value.isDict()) { if (name == "available_memory_proportion") { value.forEachInDict( [&options](const IPyValue &ipu, const IPyValue &memory) { options.setMemoryProportion(ipu.toUInt64(), memory.toFloatWithRangeCheck()); }); } else if (name == "patterns") { auto patterns_level = py_opts.getFromDict("patterns_level"); ERROR_ON_MSG(patterns_level == nullptr, "PopART option 'patterns' should not be set " "without first setting 'patterns_level'."); options.setPatternsLevel(patterns_level->toUInt64()); value.forEachInDict([&options](const IPyValue &pattern, const IPyValue &enabled) { options.addPattern(pattern.toString().c_str(), enabled.toBoolean()); }); } else if (name.rfind("location_", 0) == 0) { value.forEachInDict([&options, &name](const IPyValue &tensor, const IPyValue &location) { options.setTensorLocation(name.c_str(), tensor.toString().c_str(), location.toUInt64()); }); } else { value.forEachInDict([&options, &name](const IPyValue &str_key, const IPyValue &str_value) { options.insertStringPairOption(name.c_str(), str_key.toString().c_str(), str_value.toString().c_str()); }); } } else { ERROR("Unknown value type " << 
value.type() << " for option " << name); } }); } } // namespace poptorch ================================================ FILE: poptorch/source/Utils.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include #include #include #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "PoptorchSymbols.hpp" #include "popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" namespace poptorch { torch::jit::Node *findEarliestUser(const torch::jit::Value *value) { const auto &uses(value->uses()); if (uses.empty()) { return nullptr; } torch::jit::Node *earliest_user = uses[0].user; for (size_t i = 1; i < uses.size(); i++) { auto *node = uses[i].user; if (node->isBefore(earliest_user)) { earliest_user = node; } } return earliest_user; } bool isNondeterministic(const torch::jit::Node &node) { if (node.isNondeterministic()) { return true; } // Handle extra cases until this is fixed upstream // https://github.com/pytorch/pytorch/issues/52599 static const auto non_deterministic_nodes = { c10::aten::normal, c10::aten::normal_, c10::aten::feature_dropout, c10::aten::randint, c10::aten::bernoulli, c10::aten::bernoulli_, c10::aten::uniform_, c10::aten::randperm, c10::aten::exponential_, c10::aten::random_, }; return std::find(non_deterministic_nodes.begin(), non_deterministic_nodes.end(), node.kind()) != non_deterministic_nodes.end(); } std::string nodeToString(const torch::jit::Node *node) { std::stringstream ss; node->print(ss, 0, nullptr, true, false, false, false); std::string node_str = ss.str(); return node_str; } std::string scalarTypeToOnnxString(const at::ScalarType type) { switch (type) { case at::ScalarType::Byte: return "UINT8"; case at::ScalarType::Char: return "INT8"; case at::ScalarType::Short: return "INT16"; case at::ScalarType::Int: return "INT32"; case at::ScalarType::Long: return 
"INT64"; case at::ScalarType::Half: return "FLOAT16"; case at::ScalarType::Float: return "FLOAT"; case at::ScalarType::Double: return "DOUBLE"; case at::ScalarType::ComplexHalf: return "UNDEFINED"; case at::ScalarType::ComplexFloat: return "COMPLEX64"; case at::ScalarType::ComplexDouble: return "COMPLEX128"; case at::ScalarType::Bool: return "BOOL"; case at::ScalarType::BFloat16: return "BFLOAT16"; case at::ScalarType::QInt8: case at::ScalarType::QUInt8: case at::ScalarType::QInt32: return "UNDEFINED"; default: return "(unknown type)"; } } at::ScalarType onnxStrToScalarType(const char *type_str) { if (strcmp(type_str, "UINT8") == 0) { return at::ScalarType::Byte; } if (strcmp(type_str, "INT8") == 0) { return at::ScalarType::Char; } if (strcmp(type_str, "INT16") == 0) { return at::ScalarType::Short; } if (strcmp(type_str, "INT32") == 0) { return at::ScalarType::Int; } if (strcmp(type_str, "INT64") == 0) { return at::ScalarType::Long; } if (strcmp(type_str, "FLOAT16") == 0) { return at::ScalarType::Half; } if (strcmp(type_str, "FLOAT") == 0) { return at::ScalarType::Float; } if (strcmp(type_str, "DOUBLE") == 0) { return at::ScalarType::Double; } if (strcmp(type_str, "COMPLEX64") == 0) { return at::ScalarType::ComplexFloat; } if (strcmp(type_str, "COMPLEX128") == 0) { return at::ScalarType::ComplexDouble; } if (strcmp(type_str, "BOOL") == 0) { return at::ScalarType::Bool; } if (strcmp(type_str, "BFLOAT16") == 0) { return at::ScalarType::BFloat16; } ERROR("No at::scalar_type for " << type_str); } at::ScalarType coerceToSupportedType(at::ScalarType type) { switch (type) { case at::ScalarType::Double: return at::ScalarType::Float; case at::ScalarType::Long: return at::ScalarType::Int; default: break; } return type; } torch::jit::Node *createAndInsertCastOp(torch::jit::Graph *graph, torch::jit::Value *val, at::ScalarType type) { // create args for cast torch value to type auto *const long_dtype = insertConstant(graph, type); auto *const false_val = insertConstant(graph, 
false); auto *const none = graph->createNone(); insertNodeInGraph(graph, none); // create and add upcast index to long auto *cast = createAndInsertNode(graph, c10::aten::to, {val, long_dtype, false_val /*non_blocking*/, false_val /*copy*/, none->output() /*memory_format*/}); cast->output()->setType( val->type()->expect()->withScalarType(type)); return cast; } namespace { // Adds a null pointers for every unused tensor in an unused tuple void addNullPtrsForUnusedTuple(const c10::TupleType *tuple_type, std::vector *tensors) { for (const auto &element : tuple_type->elements()) { switch (element->kind()) { case c10::TypeKind::TensorType: { tensors->push_back(nullptr); break; } case c10::TypeKind::TupleType: { auto type = element->expect(); addNullPtrsForUnusedTuple(type.get(), tensors); break; } default: { ERROR("Unsupported input type '" << c10::typeKindToString(element->kind()) << "'"); } } } } void processInput(torch::jit::Graph *graph, torch::jit::Value *input, std::vector *tensors) { switch (input->type()->kind()) { case c10::TypeKind::TensorType: ERROR_ON(input->node()->kind() != c10::prim::Param && input->node()->kind() != c10::prim::TupleUnpack); tensors->push_back(input); break; case c10::TypeKind::ListType: // Fallthrough. 
case c10::TypeKind::TupleType: { // Find the TupleUnpack node if (input->hasUses()) { ERROR_ON(input->uses().size() != 1); auto *unpack = input->uses()[0].user; ERROR_ON(unpack->kind() != c10::prim::TupleUnpack); for (auto *element : unpack->outputs()) { // Recurse for nested tuple support processInput(graph, element, tensors); } } else { // We need placeholders or the values will not align with input tensors auto tuple_type = input->type()->expect(); addNullPtrsForUnusedTuple(tuple_type.get(), tensors); } break; } default: ERROR("Unsupported input type '" << c10::typeKindToString(input->type()->kind()) << "'"); } } } // namespace std::vector collapsedGraphInputHierachy(torch::jit::Graph *graph) { std::vector tensors; for (auto *input : graph->inputs()) { processInput(graph, input, &tensors); } return tensors; } size_t numTensorsForType(const c10::TypePtr &type) { switch (type->kind()) { case c10::TypeKind::TensorType: return 1; case c10::TypeKind::ListType: { const auto list_type = type->cast(); ERROR_ON(!list_type); return list_type->numElements(); } case c10::TypeKind::TupleType: { size_t num_tensors = 0; const auto tuple = type->expect(); for (const auto &element_type : tuple->elements()) { num_tensors += numTensorsForType(element_type); } return num_tensors; } default: ERROR("Unsupported output type '" << c10::typeKindToString(type->kind()) << "'"); } } namespace { bool shouldDestroy(torch::jit::Node *node) { // Skip parameters and nodes with any uses. return !(node->kind() == c10::prim::Param || node->hasUses()); } // Store the inputs used by this node. // Ops may use the same input twice, so use a set to store only unique inputs. 
std::unordered_set copyInputs(torch::jit::Node *node) { std::unordered_set inputs; for (torch::jit::Value *user : node->inputs()) { inputs.insert(user->node()); } return inputs; } void searchAndPossiblyDestroyInternal( torch::jit::Node *node, std::unordered_set *destroyed) { if (destroyed->count(node) != 0u) { return; } if (!shouldDestroy(node)) { return; } const auto inputs = copyInputs(node); node->destroy(); destroyed->insert(node); // If any of the previously used values now have no users repeat the process // for them. for (auto *user : inputs) { searchAndPossiblyDestroyInternal(user, destroyed); } } } // namespace void searchAndPossiblyDestroy( const std::unordered_set &to_test) { std::unordered_set destroyed; for (auto *node : to_test) { searchAndPossiblyDestroyInternal(node, &destroyed); } } void removeAndPossiblyDestroyAllInputs(torch::jit::Node *node) { std::unordered_set inputs; for (auto *i : node->inputs()) { inputs.insert(i->node()); } node->removeAllInputs(); searchAndPossiblyDestroy(inputs); } std::unique_ptr stringToUniquePtr(const std::string &str) { auto ptr = std::unique_ptr(new char[str.size() + 1]); str.copy(ptr.get(), std::string::npos); ptr.get()[str.size()] = '\0'; return ptr; } // Convert that IR type into a C++ vector of ints. std::vector shapeFromTensor(const torch::jit::Value *value) { // Extract the type from the pytorch IR. const c10::TensorTypePtr as_tensor = value->type()->expect(); const c10::VaryingShape varying_shape = as_tensor->sizes(); const auto &optional_shape_size = varying_shape.size(); const auto &optional_dims = varying_shape.sizes(); // Convert that IR type into a C++ vector of ints. 
std::vector shape;
  if (optional_shape_size) {
    shape.reserve(optional_shape_size.value());
  }
  if (optional_dims) {
    const auto &dims = optional_dims.value();

    for (const auto &optional_dim : dims) {
      if (optional_dim) {
        shape.push_back(optional_dim.value());
      }
    }
  }
  return shape;
}

// Make the scalar types of |weight| and |bias| match the one of |input|.
// A value whose node is None is left untouched; a value whose scalar type
// differs from the input's is replaced (through the reference) by the output
// of a cast node created with createCast.
void castWeightAndBias(torch::jit::Graph *graph, torch::jit::Value *input,
                       torch::jit::Value *&weight, torch::jit::Value *&bias) {
  const c10::ScalarType input_type =
      input->type()->expect()->scalarType().value();

  if (!isNone(weight->node())) {
    const c10::ScalarType weight_type =
        weight->type()->expect()->scalarType().value();

    if (weight_type != input_type) {
      weight = createCast(graph, weight, input_type)->output();
    }
  }

  if (!isNone(bias->node())) {
    const c10::ScalarType bias_type =
        bias->type()->expect()->scalarType().value();

    if (bias_type != input_type) {
      bias = createCast(graph, bias, input_type)->output();
    }
  }
}

// Capture the scalar type and the sizes of an eager tensor.
JitTensorInfo::JitTensorInfo(const at::Tensor &tensor) {
  scalar_type = tensor.scalar_type();
  dims = tensor.sizes().vec();
}

// Capture the scalar type and the concrete sizes of a JIT value.
// Raises an error if the value is not a tensor, or if its scalar type or its
// concrete sizes are not set.
JitTensorInfo::JitTensorInfo(torch::jit::Value *value) {
  auto tensor_type = value->type()->cast();
  // cast() gives back a null pointer when the type does not match (the other
  // users of cast() in this code base check for it): fail with a clear
  // message rather than dereferencing a null pointer below.
  ERROR_ON_MSG(!tensor_type,
               "JIT value %" << value->debugName() << " is not a tensor");
  ERROR_ON_MSG(!tensor_type->scalarType().has_value(), "Data type not set");
  ERROR_ON_MSG(!tensor_type->sizes().concrete_sizes().has_value(),
               "Size not set");
  scalar_type = *tensor_type->scalarType();
  dims = *tensor_type->sizes().concrete_sizes();
}

// Format as "<scalar type>(<dim>, <dim>, ...)".
std::string JitTensorInfo::toString() const {
  std::stringstream ss;
  ss << scalar_type << "(";
  std::string sep;
  for (auto d : dims) {
    ss << sep << d;
    // Only the first dimension is printed without a separator.
    sep = ", ";
  }
  ss << ")";
  return ss.str();
}

// Raise an error unless the JIT value and the tensor have the same scalar
// type and the same dimensions.
void validateTensorShapeAndType(torch::jit::Value *value,
                                const at::Tensor &tensor) {
  const JitTensorInfo jit(value);
  const JitTensorInfo torch(tensor);
  const bool match = std::tie(torch.scalar_type, torch.dims) ==
                     std::tie(jit.scalar_type, jit.dims);
  ERROR_ON_MSG(!match, "Shape/Type mismatch: JIT tensor %"
                           << value->debugName() << " " << jit.toString()
                           << " is incompatible with " << torch.toString());
}

void
setNodeTensorAttrValue(torch::jit::Node *node, torch::jit::TensorAttr::ConstructorType value) { node->ts_(c10::attr::value, {std::forward(value)}); } const torch::jit::TensorAttr::ValueType & getNodeTensorAttrValue(const torch::jit::Node *node) { ERROR_ON_MSG(node->kindOf(c10::attr::value) != torch::jit::AttributeKind::ts, "[Internal] expected type 'ts' but got " << torch::jit::toString(node->kindOf(c10::attr::value))); const auto &ts = node->ts(c10::attr::value); ERROR_ON(ts.size() != 1); return ts.at(0); } std::string ListTypeWithNumElements::str() const { return fmt::format("TensorList[{}]", _num_elements); } } // namespace poptorch ================================================ FILE: poptorch/source/dispatch_tracer/CMakeLists.txt ================================================ set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_library(dispatch_tracer STATIC RegisterAtenOverloads.cpp CommonHelperFunctions.cpp dispatchers/IDispatch.cpp dispatchers/JitDispatch.cpp InplaceAliasMapper.cpp ValueMapper.cpp Tensor.cpp TypeInferenceHandler.cpp ) target_link_libraries(dispatch_tracer PUBLIC torch PRIVATE poptorch_internal_headers poptorch_logging poptorch_compiler popart_compiler poptorch_err ) set_property(TARGET dispatch_tracer PROPERTY CXX_STANDARD 17) ================================================ FILE: poptorch/source/dispatch_tracer/CommonHelperFunctions.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#include "CommonHelperFunctions.hpp" #include #include #include #include #include #include #include #include #include "../popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "InplaceAliasMapper.hpp" #include "ValueMapper.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { bool isGenericListOfTensors(c10::IValue &value) { if (!value.isList()) { return false; } bool not_empty = false; for (c10::IValue const list_value : value.toList()) { if (!list_value.isTensor()) { return false; } not_empty = true; } return not_empty; } bool isListOfOptionalTensors(c10::IValue &value) { if (!value.isList()) { return false; } return value.toList().elementType() == c10::getTypePtr>(); } torch::jit::Value *insertValueIntoGraphAndTrackIt(c10::IValue &value, torch::jit::Graph &graph, ValueMapper &mapper) { if (value.isTensor()) { // Handle tensors. at::Tensor const tensor = value.toTensor(); // Undefined tensors are optional tensors. if (!tensor.defined()) { // Create a null IR value. torch::jit::Node *node = graph.createNone(); insertNodeInGraph(&graph, node); return node->output(); } torch::jit::Value *val = mapper.getValueForTensor(tensor); if (val == nullptr) { ERROR_ON_MSG(tensor.device().type() == c10::DeviceType::IPU, "Attempted to promote a Tensor converted (using " ".to(\"ipu\") or .ipu()) outside an IPUScope or IPUContext " "with the PopART compiler."); // This is probably an external tensor that we didn't catch. Assume // it's a constant. val = insertConstant(graph, copyAndCoerceType(tensor)); setSourceRangeToCurrentLocation(val->node()); // Don't track constants in the ValueMapper as they are CPU tensors. 
} logging::trace( "[DISPATCHER] Tensor input: tensor ptr {} ({}), jit ir %{} (scalar " "type {})", reinterpret_cast(tensor.unsafeGetTensorImpl()), toString(tensor), val->debugNameBase(), val->type()->expect()->scalarType().value_or( at::ScalarType::Undefined)); return val; } // If a generic list only contains tensors then it is a tensor // list and we handle both the same way. if (value.isTensorList() || isGenericListOfTensors(value)) { // Handle tensor lists. std::vector list_values; if (value.isTensorList()) { for (c10::IValue list_value : value.toTensorVector()) { list_values.push_back( insertValueIntoGraphAndTrackIt(list_value, graph, mapper)); } } else { for (c10::IValue list_value : value.toList()) { list_values.push_back( insertValueIntoGraphAndTrackIt(list_value, graph, mapper)); } } // We assume all lists with the same jit values are the same list in python. torch::jit::Value *val = mapper.getValueForTensorList(list_values); if (val == nullptr) { c10::TypePtr type_ptr; if (value.isTensorList()) { type_ptr = c10::TensorType::get(); } else if (isListOfOptionalTensors(value)) { type_ptr = c10::OptionalType::create(c10::TensorType::get()); } auto *list = graph.createList(type_ptr, list_values); insertNodeInGraph(&graph, list); val = list->output(); mapper.addTensorList(list_values, val); } return val; } // Assume value is a true constant and not a tensor so we don't have to // track it in the value mapper. It will get canonicalised later. torch::jit::Value *val = insertConstant(&graph, value); ERROR_ON_MSG(val == nullptr, "Internal: graph could not insert a constant"); logging::trace("[DISPATCHER] Constant input: jit ir %{}, ivalue tag kind: {}", val->debugNameBase(), value.tagKind()); return val; } // Create a node based on the schema which deduces the input types // from the inputs/stack and the name from the schema. 
As far as our // canonicalisation is concerned this *is* the "aten" node it purports to be // however it may not match it exactly, and is not created by the normal JIT // process. torch::jit::Node * createAtenTarget(torch::jit::Graph &graph, const c10::FunctionSchema &schema, const std::vector &inputs, c10::Stack *stack, ValueMapper &mapper) { logging::trace("[DISPATCHER] Create aten target {}", schema.name()); torch::jit::Symbol const symbol = torch::jit::Symbol::fromQualString(schema.name()); // Create the aten target node for our canonicalisation to target. torch::jit::Node *aten_target = createAndInsertNode(&graph, symbol, inputs, ImplicitCast::None, OutputType::Unknown, schema.returns().size()); for (std::size_t i = 0; i < aten_target->inputs().size(); ++i) { torch::jit::Value *in = aten_target->input(i); // If we are a constant. if (in->node()->kind() == at::prim::Constant) { c10::IValue val = stack->at(i); if (val.isTensor()) { at::Tensor const as_tensor = val.toTensor(); // But actually we are a previously seen tensor which has been demoted // to a constant. torch::jit::Value *new_val = mapper.getValueForTensor(as_tensor); if ((new_val != nullptr) && new_val != in) { in->replaceAllUsesWith(new_val); in->node()->destroy(); } } } } return aten_target; } } // namespace at::ScalarType scalarTypeOrDefault(c10::optional dtype) { return dtype ?
*dtype : at::ScalarType::Float; } at::Tensor copyAndCoerceType(const at::Tensor &tensor) { at::Tensor const copy; const auto scalar_type = tensor.scalar_type(); const auto coerced_scalar_type = coerceToSupportedType(scalar_type); if (scalar_type != coerced_scalar_type) { static std::uint64_t log_repeat = 0; logging::warn(log_repeat, "[DISPATCHER] Tensor (ptr {}) type coerced from {} to {}", static_cast(tensor.unsafeGetTensorImpl()), scalar_type, coerced_scalar_type); return tensor.to(coerced_scalar_type); } return tensor; } std::vector getInplaceArguments(const c10::Stack &stack, const c10::FunctionSchema &schema) { logging::trace("[DISPATCHER][JIT] Looking for inplace arguments in schema {}", schema); std::vector results; const auto inplace_arg_id = InplaceArgAliasMapper::getInplaceArg(schema.name()); for (std::size_t arg = 0; arg < schema.arguments().size(); ++arg) { const c10::Argument &argument = schema.arguments()[arg]; const c10::IValue value = stack[arg]; if (value.isTensor()) { at::Tensor const &tensor = value.toTensor(); // Undefined tensors are optional tensors. 
if (!tensor.defined()) { continue; } if (((argument.alias_info() != nullptr) && argument.alias_info()->isWrite()) || inplace_arg_id == arg) { logging::trace("[DISPATCHER][JIT] Found inplace argument, tensor ptr " "{}, tensor {}", reinterpret_cast(tensor.unsafeGetTensorImpl()), toString(tensor)); results.push_back(tensor); } } } return results; } torch::jit::Node *lowerFromSchema(const c10::FunctionSchema &schema, c10::Stack *stack, torch::jit::Graph &graph, ValueMapper &mapper) { std::vector inputs; for (std::size_t arg = 0; arg < schema.arguments().size() && arg < stack->size(); ++arg) { auto value = (*stack)[arg]; inputs.push_back(insertValueIntoGraphAndTrackIt(value, graph, mapper)); } return createAtenTarget(graph, schema, inputs, stack, mapper); } std::string toString(const at::Tensor &t) { return fmt::format("sizes={}, type={}", t.sizes(), t.scalar_type()); } bool isHalfTensor(const at::Tensor &t) { return t.scalar_type() == at::ScalarType::Half; } c10::Device deviceOrDefaultIpu(c10::optional device) { return device ? *device : c10::Device(at::kIPU, 0); } std::string getSchemaKey(const c10::FunctionSchema &schema) { // Unfortunately we can't overload based only on the schema symbol as it does // not contain the overload info. if (schema.overload_name().empty()) { return schema.name(); } return schema.name() + "." + schema.overload_name(); } } // namespace poptorch ================================================ FILE: poptorch/source/dispatch_tracer/CommonHelperFunctions.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#ifndef POPTORCH_DISPATCH_COMMON_HELPERS_HPP_ #define POPTORCH_DISPATCH_COMMON_HELPERS_HPP_ #include #include #include #include #include #include namespace torch { namespace jit { struct Graph; struct Node; struct Value; } // namespace jit } // namespace torch namespace poptorch { class ValueMapper; at::Tensor copyAndCoerceType(const at::Tensor &tensor); // From the schema deduce which arguments, if any, are inplace and return all // of them, in argument order. This might include an argument of an op that // is not truly inplace, e.g. it returns the 'out' argument in the schema // op(Tensor self, Tensor(a!) out) -> (Tensor(a!)) even when 'self' and 'out' // are not the same tensor. std::vector getInplaceArguments(const c10::Stack &stack, const c10::FunctionSchema &schema); // Using the schema definition as a guide look up all the correct // torch::jit::Values in the stack and create a jit node with the correct // symbol. Input values from the stack are also inserted into the graph. torch::jit::Node *lowerFromSchema(const c10::FunctionSchema &schema, c10::Stack *stack, torch::jit::Graph &graph, ValueMapper &mapper); // Return a string containing the tensor sizes and type. std::string toString(const at::Tensor &t); bool isHalfTensor(const at::Tensor &t); at::ScalarType scalarTypeOrDefault(c10::optional dtype); // If device is set: return device, otherwise return the default device (ipu0) c10::Device deviceOrDefaultIpu(c10::optional device); std::string getSchemaKey(const c10::FunctionSchema &schema); } // namespace poptorch #endif // POPTORCH_DISPATCH_COMMON_HELPERS_HPP_ ================================================ FILE: poptorch/source/dispatch_tracer/InplaceAliasMapper.cpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#include #include "InplaceAliasMapper.hpp" namespace poptorch { /* Return the single process-wide mapper, created on first use as a function-local static. */ InplaceArgAliasMapper &InplaceArgAliasMapper::getInstance() { static InplaceArgAliasMapper instance; return instance; } /* Record alias_arg_id under operator_name. While a namespace is set the key is stored as "namespace::operator_name". emplace() is used, so a key that is already registered keeps its first value. */ void InplaceArgAliasMapper::registerInplaceArgId( const std::string &operator_name, std::size_t alias_arg_id) { std::string key = _namespace ? fmt::format("{}::{}", _namespace.value(), operator_name) : operator_name; _operator_name_to_arg_id.emplace(key, alias_arg_id); } /* Look operator_name up in the singleton's table and return the registered id, or std::nullopt when it was never registered. The name is used verbatim as the key, so it must already carry any namespace prefix. */ std::optional InplaceArgAliasMapper::getInplaceArg(const std::string &operator_name) { auto &operator_name_to_arg_id = getInstance()._operator_name_to_arg_id; const auto it = operator_name_to_arg_id.find(operator_name); if (it != operator_name_to_arg_id.end()) { return it->second; } return std::nullopt; } /* Set the prefix applied by subsequent registerInplaceArgId() calls. */ void InplaceArgAliasMapper::setNamespace(const std::string &p_namespace) { _namespace = p_namespace; } /* Clear the prefix: later registrations use the bare operator name. */ void InplaceArgAliasMapper::unsetNamespace() { _namespace = std::nullopt; } /* Run init_mapper against the singleton with p_namespace set for the duration of the call, then clear the namespace again. */ InplaceArgAliasMapperInit::InplaceArgAliasMapperInit( void (*init_mapper)(InplaceArgAliasMapper &), const std::string &p_namespace) { auto &alias_mapper = InplaceArgAliasMapper::getInstance(); alias_mapper.setNamespace(p_namespace); init_mapper(alias_mapper); alias_mapper.unsetNamespace(); } /* Registers id 3 for torch_scatter::scatter_mul, scatter_max and scatter_min. NOTE(review): presumably 3 is the position of the output tensor argument of these ops - confirm against the torch_scatter schemas. */ INPLACE_ARG_MAPPER_IMPL(torch_scatter, mapper) { mapper.registerInplaceArgId("scatter_mul", 3); mapper.registerInplaceArgId("scatter_max", 3); mapper.registerInplaceArgId("scatter_min", 3); } } // namespace poptorch ================================================ FILE: poptorch/source/dispatch_tracer/InplaceAliasMapper.hpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
#ifndef POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_ #define POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_ #include #include #include #include namespace poptorch { /* Singleton registry that maps an operator name to a registered in-place argument id. Construction, copy and move are private or deleted: obtain the instance through getInstance(). The optional namespace is only used to prefix keys while registering. */ class InplaceArgAliasMapper { public: static InplaceArgAliasMapper &getInstance(); static std::optional getInplaceArg(const std::string &operator_name); void registerInplaceArgId(const std::string &operator_name, std::size_t alias_arg_id); void setNamespace(const std::string &p_namespace); void unsetNamespace(); private: InplaceArgAliasMapper() = default; ~InplaceArgAliasMapper() = default; InplaceArgAliasMapper(const InplaceArgAliasMapper &) = delete; InplaceArgAliasMapper(InplaceArgAliasMapper &&) = delete; InplaceArgAliasMapper &operator=(const InplaceArgAliasMapper &) = delete; InplaceArgAliasMapper &operator=(InplaceArgAliasMapper &&) = delete; std::unordered_map _operator_name_to_arg_id; std::optional _namespace; }; /* Helper whose constructor takes an init function and a namespace string. INPLACE_ARG_MAPPER_IMPL below defines a static object of this type, so the init function is invoked during static initialisation. */ struct InplaceArgAliasMapperInit { InplaceArgAliasMapperInit(void (*init_mapper)(InplaceArgAliasMapper &), const std::string &p_namespace); }; /* INPLACE_ARG_MAPPER_IMPL(Namespace, mapper) must be followed by a brace-enclosed body: the macro declares a uniquely named static init function, defines a static InplaceArgAliasMapperInit bound to it with the stringified Namespace, and ends with the function header that the body completes. */ #define INPLACE_ARG_MAPPER_IMPL(Namespace, mapper) \ _INPLACE_ARG_MAPPER_IMPL(Namespace, mapper, C10_UID) #define _INPLACE_ARG_MAPPER_IMPL(Namespace, mapper, uid) \ static void Namespace##_##uid##_init_mapper_(InplaceArgAliasMapper &); \ static InplaceArgAliasMapperInit Namespace##_##uid##_init_arg_mapper = \ InplaceArgAliasMapperInit(&Namespace##_##uid##_init_mapper_, \ #Namespace); \ static void Namespace##_##uid##_init_mapper_(InplaceArgAliasMapper &(mapper)) } // namespace poptorch #endif // POPTORCH_DISPATCH_INPLACE_ALIAS_MAPPER_HPP_ ================================================ FILE: poptorch/source/dispatch_tracer/README.md ================================================ ## Dispatch tracing Dispatch tracing is our own implementation of torch::jit::trace which allows us to sidestep some of the constraints of that API as well as trace autograd functions. We support two backends. 
- JIT: Traces the incoming user model into normal PyTorch JIT IR first, then canonicalises it into our PopART-compatible JIT IR. - MLIR: Traces the model directly into our PyTorch-native MLIR backend. Can use the above mechanism internally to decompose operations into the PopART subset or support them directly. RegisterAtenOverloads intercepts the initial call from PyTorch then directs that to whichever backend is active. A backend must provide a fallback operation and a function for any overloaded PyTorch function which cannot be "boxed" or has unique properties which make a dedicated handler easier. # JIT JIT works by using the normal PyTorch JIT API to turn the given OperatorHandle and Stack (of at::Tensor/scalar/vector types) into JIT nodes. We then canonicalise those nodes into our own IR. Once the graph has been traced, the traced graph can be retrieved and used in our compile process as a stand-in for the normal torch::jit::trace compiled graph. Most cleanup stages are no longer required at this point. Models can still only be traced in inference mode, with PopART optionally applying its own autograd to turn the traced inference graph into a training graph. # MLIR MLIR is somewhat more complex as it is able to trace more of the graph as it uses the PyTorch autograd and gradients directly. This means it gets exposed to more of PyTorch so must handle more unexpected but legal inputs. For example, during autograd PyTorch stores variables for later processing, like the forward input to be later retrieved in the backward pass. In some of these cases PyTorch will shallow-copy the tensor by just swapping the storage pointer. However, to our eyes it is a new tensor. So in the MLIR path we have to handle more tensor-to-value resolution code. Other than having to faithfully lower more varied legal input than in JIT, the main difference is that it has two paths to lower a node. 
- It can use the JIT path to guarantee it can support at least as much as PopART and to reuse our canonicalisation code to break down nodes further. - It can directly map a torch operation onto IR without needing canonicalisation. See CompilerDispatchTable.cpp for all the calls. The API with MLIR is generated automatically by MLIR and can be seen in the poptorch_compiler pytorch_bridge include folder. - DirectlySupportedOps.h.inc: Maps aten operations directly onto an MLIR operation. - PopartAPISupportedOps.h.inc: Maps aten operations onto the PopART subset via unpacking JIT arguments, just like LowerToPopart. # Code overview | File | Description | | ---- | --- | | RegisterAtenOverloads.cpp | Dispatcher point of first contact. Registers hooks with PyTorch to pick up the incoming calls. | | ValueMapper.cpp/hpp | Handles some state/logic to help map at::Tensors onto IR values and MLIR Tensors. | | CommonHelperFunctions.cpp/hpp | Helper functions used by JIT and MLIR backends which handle the JIT graph. | | dispatchers | Folder containing the backend-specific dispatch code. | | Tracer.hpp | Abstract backend definition. | | JitDispatch.hpp/cpp | Contains the implementation of the JIT backend. | | MLIRDispatch.hpp/cpp | Contains the implementation of the MLIR backend. | | CompilerDispatchTable.cpp | Dispatch table used by the MLIR backend. | See the MLIR section for details on DirectlySupportedOps/PopartAPISupportedOps. ================================================ FILE: poptorch/source/dispatch_tracer/RegisterAtenOverloads.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../PoptorchSymbols.hpp" #include "../popart_canonicalization/PopartCanonicalizationUtils.hpp" #include "CommonHelperFunctions.hpp" #include "Tensor.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/InplaceOps.hpp" #include "poptorch/Utils.hpp" #include "poptorch_err/ExceptionHandling.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "dispatchers/IDispatch.hpp" #include "pytorch_bridge/IpuSession.hpp" #include "dispatchers/JitDispatch.hpp" #include "pytorch_bridge/CompilerOptions.hpp" // The functions in this file are called via Torch's dispatcher, therefore // we should only catch the exceptions which are not handled by // the dispatcher. #define PTC(f) \ PoptorchCatchWrapperImpl::wrap #define PTC_BOXED(f) torch::CppFunction::makeFromBoxedFunction() namespace poptorch { namespace { /* Render an IValue for the trace logs: tensors go through str(), simple values are streamed with operator<<, lists are rendered element by element, and any other kind is shown as its tag kind between angle brackets. */ std::string valueToString(const c10::IValue &ivalue) { if (ivalue.isTensor()) { return str(ivalue.toTensor()); } // TODO(T59880) // Don't rely on operator<< for everything as we're currently using // the XLA dispatch key but using our own Tensor type: bad things // might happen if upstream torch tries to print a tensor by itself. 
if (ivalue.isNone() || ivalue.isScalar() || ivalue.isString() || ivalue.isDevice() || ivalue.isStream() || ivalue.isObject() || ivalue.isEnum()) { std::stringstream ss; ss << ivalue; return ss.str(); } if (ivalue.isList()) { std::stringstream ss; std::string sep; ss << ivalue.tagKind() << " ["; for (const auto &v : ivalue.toList()) { ss << sep << valueToString(v); sep = ", "; } ss << "]"; return ss.str(); } return "<" + ivalue.tagKind() + ">"; } /* Return true when the device type is IPU. */ bool isIpuDevice(const c10::Device &d) { return d.type() == c10::DeviceType::IPU; } /* * The dispatchers are statically registered and called without any additional * context so we need a static structure to handle the initial interception. * Afterwards we redirect to one of the handlers to avoid keeping around too * much static state. */ struct GlobalTracerContext { // When we are in a live dispatch context. Used to prevent redispatch back // to us when we call CPU implementations and to call CPU when we are in // BackendSelect and out of scope. inline bool isDispatchOn() { return dispatch_on; } /* True when a dispatcher object is currently installed. */ bool hasActiveDispatch() { return static_cast(_active_dispatch); } /* Return the installed dispatcher; raises through ERROR_ON_MSG when there is none. */ IDispatch *activeDispatch() { ERROR_ON_MSG(!_active_dispatch, "There is no active dispatch"); return _active_dispatch.get(); } /* Replace the installed dispatcher; passing nullptr uninstalls it. */ void resetActiveDispatch(std::unique_ptr new_dispatch) { _active_dispatch = std::move(new_dispatch); } /* Hand the current Python call stack to the active dispatcher (requires one to be installed). */ void updatePythonCallstack() { activeDispatch()->setPythonStack(torch::jit::tracer::pythonCallstack()); } /* Forward info to the registered error thrower; does nothing when no thrower has been registered. */ void throwPoptorchError(const PoptorchErrorInfo &info) { if (_poptorch_error_thrower) { _poptorch_error_thrower(info); } } // A simple guard to stop us from redispatching when we are already in a // dispatch context. bool dispatch_on{false}; // A state used to determine if the new tensors we receive from the dispatcher // are inputs or parameters. bool moving_parameters{false}; // A state used to determine whether we are currently registering output // tensors for the graph (in IPUScope.outputs()). 
If we're not, moving // output tensors may result in bad data, so we raise an error. An example of when // this might happen is using torch dynamic slicing in the dispatcher // (instead of poptorch.dynamic_slice()). bool moving_outputs{false}; // We can't tell the difference between inputs and constants so for // now we ask the user to manually specify the input tensors. // We use TensorImpl* cast as void* to identify them. // // Note: these should only be used for pointer comparisons and should never // be dereferenced as TensorImpl objects as we don't know if they still // exist. std::set graph_inputs; // Create and store Tensors... TensorStore tensor_store; /* Install the callback used by throwPoptorchError(). */ void setPoptorchErrorThrower(PoptorchErrorThrower thrower) { _poptorch_error_thrower = std::move(thrower); } private: // The active dispatcher. Created once upon dispatch start. std::unique_ptr _active_dispatch; PoptorchErrorThrower _poptorch_error_thrower; }; /* Held through a unique_ptr so that poptorchAtExit() can destroy it explicitly. */ std::unique_ptr context = std::make_unique(); /* Dereferences the global context: must not be called once poptorchAtExit() has reset it. */ GlobalTracerContext &getContext() { return *context; } // Poplar doesn't support long, so cast to int if needed. Double tensors are // likewise narrowed to float. 
/* Long becomes Int and Double becomes Float through Tensor::to(); a tensor of any other scalar type is returned as is. */ at::Tensor downCastIfNeeded(const at::Tensor &t) { if (t.scalar_type() == at::ScalarType::Long) { return t.to(at::ScalarType::Int); } if (t.scalar_type() == at::ScalarType::Double) { return t.to(at::ScalarType::Float); } return t; } /* Host-side element-wise cast of the range [src, src_end) from src_scalar_type into dest as dest_scalar_type. dest is not bounds-checked: the caller must provide room for as many elements as the source range holds, and src_end must fall exactly on an element boundary because the loop only stops on pointer equality. */ // NOLINTNEXTLINE void hostSideCast(void *dest, c10::ScalarType dest_scalar_type, void *src, const void *src_end, c10::ScalarType src_scalar_type) { // NOLINTNEXTLINE AT_DISPATCH_ALL_TYPES_AND( at::ScalarType::Half, dest_scalar_type, "copy_", [&] { using dest_t = scalar_t; // NOLINTNEXTLINE AT_DISPATCH_ALL_TYPES_AND( at::ScalarType::Half, src_scalar_type, "copy_", [&] { scalar_t *src_ = reinterpret_cast(src); dest_t *dest_ = reinterpret_cast(dest); // TODO(T69558): use vectorised casts // at::vec::convert(src, dest, numel); while (reinterpret_cast(src_) != src_end) { *(dest_++) = c10::static_cast_with_inter_type::apply( *(src_++)); } }); }); } // Return true if the given IPU tensor is a parameter. // Requires an active dispatcher (checked with ERROR_ON). inline bool isParameter(const at::Tensor &tensor) { ERROR_ON(!getContext().hasActiveDispatch()); return getContext().activeDispatch()->isParameter(tensor); } // copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) // Boxed handler: without an active dispatcher the copy is carried out on the // host-side buffers; with one, the copy is forwarded to the dispatcher as a // parameter, input, constant, output or plain IPU to IPU copy. void copyInplace(const c10::OperatorHandle &op, c10::Stack *stack) { const c10::FunctionSchema &schema = op.schema(); const auto num_arguments = schema.arguments().size(); auto arguments = torch::jit::last(stack, num_arguments); // In an ideal world self would be allowed to change to reflect type coercion. // Unfortunately, pytorch's boxed function interface does not properly support // outputs. To work around this, if we need to re-allocate self we map both the // new and old values in the value mapper within the dispatcher. 
// Self is marked as const here to ensure we don't accidentally change it const at::Tensor self = arguments.at(0).toTensor(); const at::Tensor src = arguments.at(1).toTensor(); logging::debug("[DISPATCHER] Intercepting aten::copy_"); logging::trace("[Input] self {}", str(self)); logging::trace("[Input] src {}", str(src)); // In eager mode the dispatcher is always active so this will only be true // when working with static graphs if (!getContext().hasActiveDispatch()) { if (self.is_ipu() && src.is_cpu()) { logging::trace("copy_ CPU -> IPU, outside dispatch"); auto scalar_type = src.scalar_type(); auto coerced_type = coerceToSupportedType(scalar_type); ERROR_ON_MSG(scalar_type != coerced_type, "Unsupported scalar type `" << scalar_type << "'. Please cast to `" << coerced_type << "' before moving this tensor to the IPU."); getContext().tensor_store.copyFromCpu(self, src); } else if (self.is_cpu() && src.is_ipu()) { logging::trace("copy_ IPU -> CPU, outside dispatch"); getContext().tensor_store.copyToCpu(self, src); } else if (self.is_ipu() && src.is_ipu()) { if (!getHostBuffer(self).hasData()) { getContext().tensor_store.allocateBuffer(self); } const auto &self_buffer = getHostBuffer(self).getCpuData(); const auto &src_buffer = getHostBuffer(src).getCpuData(); ERROR_ON(!src_buffer); /* NOTE(review): unlike the same-dtype branch, the cast branch below does not compare element counts before writing into self's buffer - confirm that callers guarantee matching sizes. */ if (self.dtype() != src.dtype()) { logging::trace("copy_ cast from {} to {} on CPU, outside dispatch", src.dtype(), self.dtype()); hostSideCast( self_buffer->data(), self.scalar_type(), src_buffer->data(), src_buffer->data() + src_buffer->size(), src.scalar_type()); } else { ERROR_ON_MSG(self_buffer->size() != src_buffer->size(), "Failed to copy_ outside dispatch: src and self host-side " "buffer sizes are not equal."); *self_buffer = *src_buffer; } } else { ERROR("Intercepted unexpected copy_ outside dispatch: only copies " "between CPU, IPU tensors as well as between IPU tensors " "themselves are supported."); } torch::jit::drop(stack, num_arguments); torch::jit::push(stack, self); 
return; } /* From here on a dispatcher is active and the copy is forwarded to it. */ getContext().updatePythonCallstack(); if (self.is_ipu()) { if (src.is_cpu()) { std::stringstream ss; ss << "copy_ CPU -> IPU "; if (isParameter(self) || getContext().moving_parameters) { getContext().activeDispatch()->addParameter(downCastIfNeeded(src), self); // Make sure the parameter flag is preserved. ss << "parameter"; } else { ERROR_ON_MSG( src.requires_grad(), "An input tensor to an IPU model can not have requires_grad set " "to True."); if (getContext().graph_inputs.count(src.unsafeGetTensorImpl()) > 0) { getContext().activeDispatch()->addInput(downCastIfNeeded(src), self); } else { getContext().activeDispatch()->addConstant(downCastIfNeeded(src), self); } ss << "input"; // Logged as "input" for both graph inputs and constants. } ss << ", new self " << str(self); logging::debug(ss.str().c_str()); torch::jit::drop(stack, num_arguments); torch::jit::push(stack, self); } else { ERROR_ON(!src.is_ipu()); logging::debug("copy_ IPU {} -> IPU {}", src.dtype(), self.dtype()); getContext().activeDispatch()->fallback(op, stack); } } else { ERROR_ON(!self.is_cpu()); if (src.is_ipu()) { ERROR_ON_MSG(!getContext().moving_outputs, "Illegal move to CPU (via `.to(\"cpu\")`) when using the " "dispatcher. Instead, return this output as an IPU tensor."); logging::debug("copy_ output IPU -> CPU"); getContext().activeDispatch()->addOutput(src, self); torch::jit::drop(stack, num_arguments); torch::jit::push(stack, self); } else { ERROR("Unexpected tensor of type " << src.unsafeGetTensorImpl()->device_type() << ", did you forget to move a tensor to " "the IPU?"); } } } } // namespace /* The four functions below toggle the context flags that copyInplace, emptyBase and detach read to tell parameter moves and output moves apart from ordinary copies. */ void startParametersMove() { getContext().moving_parameters = true; } void endParametersMove() { getContext().moving_parameters = false; } void startOutputsMove() { getContext().moving_outputs = true; } void endOutputsMove() { getContext().moving_outputs = false; } // Turn on. 
void startDispatch() { getContext().dispatch_on = true; } void setPoptorchErrorThrower(PoptorchErrorThrower thrower) { getContext().setPoptorchErrorThrower(std::move(thrower)); } void throwPoptorchError(const PoptorchErrorInfo &info) { getContext().throwPoptorchError(info); } // Turn off. void endDispatch(bool error_occurred) { getContext().dispatch_on = false; if (error_occurred) { // If an error occurred we need to destroy the dispatcher as it will be in // an inconsistent state. destroyDispatcher(); } } // Cleanup on exit callback to avoid global destructor ordering issues void poptorchAtExit() { // Ensure that the context is deleted before globals are destroyed to avoid // issues with global destructor ordering context.reset(); } // Destroys the dispatcher after we have finished compiling void destroyDispatcher() { if (getContext().isDispatchOn()) { endDispatch(); } getContext().resetActiveDispatch(nullptr); } void setParameterName(const at::Tensor &tensor, const std::string &name) { getContext().activeDispatch()->setParameterName(tensor, name); } std::string getParameterName(torch::jit::Value *value) { return getContext().activeDispatch()->getParameterName(value); } void setParameterPerReplica(const std::string ¶m_name, const at::Tensor &tensor, int comm_group_type, int shards, int variable_retrieval_mode) { getContext().activeDispatch()->setParameterPerReplica( param_name, tensor, comm_group_type, shards, variable_retrieval_mode); } bool getParameterPerReplica(torch::jit::Value *value, PerReplicaSettings &settings) { return getContext().activeDispatch()->getParameterPerReplica(value, settings); } // Returns true if the current compilation is being handled using a dispatcher. // // This is needed because in some cases, we don't want calls to be dispatched to // us, but still want to maintain information about the dispatcher. 
bool isCompilingWithDispatcher() { return getContext().hasActiveDispatch(); } // Returns true if the dispatcher is currently 'on', and should intercept calls // to us. bool isDispatcherOn() { return getContext().isDispatchOn(); } /* Build a CompilerOptions whose dispatcher.source_location_excludes receives one character vector per exclude string, in order. */ CompilerOptions createMLIROptions(const std::vector &source_location_excludes) { CompilerOptions options; std::transform( source_location_excludes.begin(), source_location_excludes.end(), std::back_inserter(options.dispatcher.source_location_excludes), [](const std::string &exclude) { return std::vector(exclude.begin(), exclude.end()); }); return options; } // Take the inputs to the graph and turn them into our IR graph // inputs/parameters. // Installs a new dispatcher for TracingMode::POPART (any other mode is an // error) and records the TensorImpl pointer of every input in graph_inputs. void createGraph(TracingMode mode, const std::vector &inputs, const CompilerOptions &options) { if (mode == TracingMode::POPART) { getContext().resetActiveDispatch( std::make_unique(options, &getContext().tensor_store)); } else { ERROR("Unsupported target"); } getContext().updatePythonCallstack(); getContext().graph_inputs.clear(); for (const auto &input : inputs) { getContext().graph_inputs.emplace( reinterpret_cast(input.unsafeGetTensorImpl())); } } /* Boxed kernel that logs the operator name and then defers to at::native::cpu_fallback. */ void cpuFallback(const c10::OperatorHandle &op, torch::jit::Stack *stack) { const auto name = c10::toString(op.operator_name()); logging::trace("[CPU Fallback] Running {} on CPU", name); // Call the actual boxed CPU fallback. 
at::native::cpu_fallback(op, stack); } /* Catch-all boxed kernel: logs the schema, traces every entry of the stack before and after the call, and hands the operation to the active dispatcher (one must be installed). */ void fallback(const c10::OperatorHandle &op, c10::Stack *stack) { const c10::FunctionSchema &schema = op.schema(); logging::debug("[DISPATCHER] Intercepting {} ", schema); getContext().updatePythonCallstack(); for (const auto &t : *stack) { logging::trace("[Input {}] {}", schema.name(), valueToString(t)); } getContext().activeDispatch()->fallback(op, stack); for (const auto &t : *stack) { logging::trace("[Output {}] {}", schema.name(), valueToString(t)); } } /* Finalise and return the in-place graph info of the active dispatcher; raises when the dynamic_cast of the active dispatcher yields null. */ InplaceGraphInfo getInplaceGraphInfo(size_t num_anchors, bool replicas_needing_broadcast) { auto *jit = dynamic_cast(getContext().activeDispatch()); ERROR_ON_MSG(jit == nullptr, "[User Unreachable] Tracer context is null."); return jit->finalizeInplaceGraphInfo(num_anchors, replicas_needing_broadcast); } /* Remove the nodes marked for deletion, then return the dispatcher's own graph object (not a clone). */ std::shared_ptr getTracedGraph() { auto *jit = dynamic_cast(getContext().activeDispatch()); ERROR_ON_MSG(jit == nullptr, "[User Unreachable] Tracer context is null."); // Build a list of nodes marked for deletion. std::unordered_set to_delete; for (torch::jit::Node *node : jit->graph->nodes()) { if (isMarkedForDeletion(node)) { to_delete.insert(node); } } // Remove the dead nodes. searchAndPossiblyDestroy(to_delete); // Return the real graph because popart_compiler will call // getDataSourceForValue() on some of these nodes and if we // clone the graph we won't be able to find the mappings. 
return jit->graph; } /* Forwarded to the active dispatcher. */ void finalizeGraph() { getContext().activeDispatch()->finalizeGraph(); } /* Raw pointer to the tensor's host-side buffer. NOTE(review): getCpuData() is dereferenced without a null check here, whereas copyInplace guards the same call with ERROR_ON - confirm callers only pass tensors that already have host data. */ void *getDataSource(const at::Tensor &tensor) { return getHostBuffer(tensor).getCpuData()->data(); } /* Forwarded to the active dispatcher. */ void *getDataSourceForValue(torch::jit::Value *value) { return getContext().activeDispatch()->getDataSource(value); } /* Value overload of isParameter; forwarded to the active dispatcher. */ bool isParameter(torch::jit::Value *value) { return getContext().activeDispatch()->isParameter(value); } // This is the function called by Torch to trigger an IPU to Host // sync: we forward it to the CPU backend which will then issue // some copy_ calls between IPU and CPU tensors instead. at::Scalar localScalarDense(const at::Tensor &self) { logging::trace("Sync to CPU"); return at::native::call_fallback_fn<&poptorch::cpuFallback, ATEN_OP(_local_scalar_dense)>::call(self); } /* Rejects aten::item inside a static graph: ERROR fires unconditionally, so the fallback call after it is not expected to be reached (presumably kept to satisfy the return type - TODO confirm ERROR always throws). */ at::Scalar item(const at::Tensor &self) { ERROR("aten::item is only supported in eager mode, but was intercepted in " "a static graph. This means an IPU to CPU copy was triggered before " "the end of the graph, for example by calling tensor.item(). " "Please ensure that any such copies are removed."); return at::native::call_fallback_fn<&poptorch::cpuFallback, ATEN_OP(item)>::call(self); } /* Shared allocation path. For an IPU device the tensor comes from the tensor store (always on the default IPU; layout, pin_memory and memory_format are not used) and is registered with the active dispatcher when there is one. Any other device goes to at::native::empty_cpu. */ at::Tensor emptyBase(at::IntArrayRef size, c10::optional dtype = c10::nullopt, c10::optional layout = c10::nullopt, c10::optional device = c10::nullopt, c10::optional pin_memory = c10::nullopt, c10::optional memory_format = c10::nullopt) { ERROR_ON(!device); // Internal error: shouldn't happen if (isIpuDevice(*device)) { // We use the device ID to determine if a tensor is a parameter // (device 1) or not (device 0) but in reality all the tensors // currently live on the same IPU so always use the default IPU. at::Tensor output = getContext().tensor_store.allocateTensor( size, dtype, nullptr, deviceOrDefaultIpu({})); // TODO(T61576) Find a better way to identify parameters and buffers. 
if (getContext().hasActiveDispatch()) { getContext().updatePythonCallstack(); getContext().activeDispatch()->registerEmptyTensor( output, getContext().moving_parameters); } return output; } // Native calls are a dispatch endpoint so will not be redispatched. at::Tensor output = at::native::empty_cpu(size, dtype, layout, device, pin_memory, memory_format); return output; } // Handler for IPU empty tensors: this means the returned tensor must be // an IPU tensor. // A missing device is replaced by the default IPU before calling emptyBase. at::Tensor emptyMemoryFormat( at::IntArrayRef size, c10::optional dtype = c10::nullopt, c10::optional layout = c10::nullopt, c10::optional device = c10::nullopt, c10::optional pin_memory = c10::nullopt, c10::optional memory_format = c10::nullopt) { auto device_or_default = deviceOrDefaultIpu(device); logging::debug( "[DISPATCHER] Intercepting aten::empty.memory_format, device {}", device_or_default.str()); return poptorch::emptyBase(size, dtype, layout, device_or_default, pin_memory, memory_format); } // func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, // Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor // Only an IPU device and the default strides for `size` are accepted; // anything else is an error. at::Tensor emptyStrided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype = c10::nullopt, c10::optional layout = c10::nullopt, c10::optional device = c10::nullopt, c10::optional pin_memory = c10::nullopt) { ERROR_ON(!device); // Internal error: shouldn't happen ERROR_ON(!isIpuDevice(*device)); logging::debug("[DISPATCHER] Intercepting aten::empty_strided, device {}", device->str()); ERROR_ON(at::detail::defaultStrides(size) != stride); return emptyBase(size, dtype, layout, device, pin_memory); } /* Numeric-ord matrix norm: every ord whose absolute value is not 2 is delegated to at::native::linalg_matrix_norm; the 2-norm raises. */ at::Tensor linalgMatrixNorm(const at::Tensor &self, const at::Scalar &ord, at::IntArrayRef dim, bool keepdim, c10::optional dtype) { auto ord_double = ord.toDouble(); auto abs_ord = std::abs(ord_double); if (abs_ord != 2.) 
{ // As long as we're not dealing with a 2-norm, we can call the // operator as usual, which will redispatch the constituent operations return at::native::linalg_matrix_norm(self, ord, dim, keepdim, dtype); } // The 2-norm is defined as the largest (for +2) or smallest (for -2) // singular value of the matrix. ERROR("Matrix 2-norm is not supported."); } /* String-ord overload: "nuc" raises, every other ord is delegated to at::native::linalg_matrix_norm. */ at::Tensor linalgMatrixNormStrOrd(const at::Tensor &self, c10::string_view ord, at::IntArrayRef dim, bool keepdim, c10::optional dtype) { if (ord != "nuc") { // As long as we're not dealing with a nuclear norm, we can call the // operator as usual, which will redispatch the constituent operations return at::native::linalg_matrix_norm(self, ord, dim, keepdim, dtype); } // The nuclear norm is defined as the sum of singular values of the matrix. ERROR("Matrix nuclear norm is not supported."); } // aten::detach(Tensor(a) self) -> (Tensor(a)) // With an active dispatcher the call is forwarded to it; otherwise the single // tensor argument is shallow-copied and detached here, reusing the input's // version counter. void detach(const c10::OperatorHandle &op, c10::Stack *stack) { logging::debug("[DISPATCHER] Intercepting aten::detach"); if (getContext().hasActiveDispatch()) { getContext().updatePythonCallstack(); // Perform the shallow copy and detach. getContext().activeDispatch()->detach(op, stack, getContext().moving_parameters); } else { const c10::FunctionSchema &schema = op.schema(); const auto num_arguments = schema.arguments().size(); const auto arguments = torch::jit::last(stack, num_arguments); ERROR_ON(arguments.size() != 1); const at::Tensor in = arguments.front().toTensor(); const at::Tensor out(in.unsafeGetTensorImpl()->shallow_copy_and_detach( /*version_counter=*/in.unsafeGetTensorImpl()->version_counter(), /*allow_tensor_metadata_change=*/true)); torch::jit::drop(stack, num_arguments); torch::jit::push(stack, out); } } // NOTE: This gets called by _weight_norm's handler, if certain conditions are // met. However, those conditions never used to be met, and so we never had to // implement this handler. Now we do, so for now just emulate the old behaviour. 
// Boxed handler: reads (v, g, dim) from the stack, replaces them with
// v * (g / norm_except_dim(v, 2, dim)) and then pushes a second, empty tensor
// shaped like g.
void weightNormInterface(const c10::OperatorHandle &op, c10::Stack *stack) {
  const auto num_arguments = op.schema().arguments().size();
  auto arguments = torch::jit::last(stack, num_arguments);

  // Copy the arguments out before they are dropped from the stack.
  const auto v = arguments.at(0).toTensor();
  const auto g = arguments.at(1).toTensor();
  const std::int64_t dim = arguments.at(2).toInt();

  torch::jit::drop(stack, num_arguments);

  const auto out = v * (g / at::norm_except_dim(v, 2, dim));

  torch::jit::push(stack, out);
  // Strictly speaking the schema of `_weight_norm_interface` returns a
  // (Tensor, Tensor); in its sole usage in `_weight_norm`, only the first
  // member is used, so just return something empty of the right shape.
  torch::jit::push(stack, at::empty_like(g));
}

// Forwards a value replacement to the active dispatcher; does nothing when no
// dispatch is active.
void replaceValueDispatcher(torch::jit::Value *v_old,
                            torch::jit::Value *v_new) {
  if (!getContext().hasActiveDispatch()) {
    return;
  }
  getContext().activeDispatch()->replaceValue(v_old, v_new);
}

// Returns ipuTensorId(tensor); raises if `tensor` is not an IPU tensor.
std::uint64_t getIpuTensorId(const at::Tensor &tensor) {
  ERROR_ON_MSG(!isIpuTensor(tensor),
               "You may only call getIpuTensorId on an IPU tensor");
  return ipuTensorId(tensor);
}

} // namespace poptorch

/*
  The actual dispatcher part. Overriding these keys causes most operations to
  fall through to our fallback catchers.
*/

TORCH_LIBRARY_IMPL(_, IPU, m) { m.fallback(PTC_BOXED(poptorch::fallback)); }

TORCH_LIBRARY_IMPL(_, AutogradIPU, m) {
  m.fallback(PTC_BOXED(poptorch::fallback));
}

/*
 There are two kinds of PyTorch ops: the ones that require registration
 (and a backend-specific kernel) and the ones that are optional. If optional
 ops are not registered they get decomposed into several required ops that must
 then be intercepted by the backend provider. More information on this can be
 found at https://pytorch.org/tutorials/advanced/extend_dispatcher.html.

 In essence:
  - required ops have 'dispatch' set to TRUE and 'default' set to FALSE in
    RegistrationDeclarations.h
  - optional ops have 'dispatch' set to FALSE or 'default' set to TRUE in
    RegistrationDeclarations.h

 RegisterOptionalAtenOps.cpp.inc registers the optional ops that our backend
 intercepts.

 RegisterMetaOps.cpp.inc registers the meta implementations of operations that
 are used for type inference
*/
#include "RegisterMetaOps.cpp.inc"
#include "RegisterOptionalAtenOps.cpp.inc"

// These cannot be intercepted using the non-autograd key unless
// torch.inference_mode is used
TORCH_LIBRARY_IMPL(aten, AutogradIPU, m) {
  // This is required to intercept detach calls when moving parameters to the
  // IPU.
  m.impl("detach", PTC_BOXED(poptorch::detach));

  // These must be intercepted at the autograd level otherwise they'll go
  // through fallback
  m.impl("linalg_matrix_norm", PTC(poptorch::linalgMatrixNorm));
  m.impl("linalg_matrix_norm.str_ord", PTC(poptorch::linalgMatrixNormStrOrd));
}

// Erase op's arguments (as many as its schema declares) from the top of the
// stack. Raises if the stack holds fewer values than that.
void popArgumentsFromStack(const c10::OperatorHandle &op, c10::Stack *stack) {
  ERROR_ON(op.schema().arguments().size() > stack->size());
  stack->erase(std::prev(stack->end(), op.schema().arguments().size()),
               stack->end());
}

// Append every element of `results` to the end of the stack, in order.
void pushResultsToStack(c10::Stack *stack,
                        const std::vector &results) {
  stack->insert(stack->end(), results.begin(), results.end());
}

// Pop op's arguments from the stack, and (if given) push any results to the
// back.
void updateStack(const c10::OperatorHandle &op, c10::Stack *stack,
                 const std::vector &results = {}) {
  popArgumentsFromStack(op, stack);
  if (!results.empty()) {
    pushResultsToStack(stack, results);
  }
}

// Get an argument from the given stack.
// Returns the n-th argument of `op`, counting from the first of the
// schema-declared arguments sitting at the top of the stack. Raises if the
// stack holds fewer values than the schema declares.
c10::IValue getNthArgument(const c10::OperatorHandle &op, c10::Stack *stack,
                           size_t n) {
  ERROR_ON(op.schema().arguments().size() > stack->size());
  return stack->at((stack->size() - op.schema().arguments().size()) + n);
}

// Boxed kernel: replaces op's arguments on the stack with its first argument.
void opReturningFirstArgument(const c10::OperatorHandle &op,
                              c10::Stack *stack) {
  const auto front = getNthArgument(op, stack, 0);
  updateStack(op, stack, {front});
}

// Boxed kernel for ops without results: goes through poptorch::fallback while
// the dispatcher is on, otherwise just pops the arguments.
void opWithoutOutputs(const c10::OperatorHandle &op, c10::Stack *stack) {
  if (poptorch::isDispatcherOn()) {
    poptorch::fallback(op, stack);
  } else {
    updateStack(op, stack);
  }
}

// Handles the op like opWithoutOutputs() and then, if the dispatcher is on,
// ends the dispatch.
void callCpuOp(const c10::OperatorHandle &op, c10::Stack *stack) {
  opWithoutOutputs(op, stack);

  if (poptorch::isDispatcherOn()) {
    poptorch::endDispatch();
  }
}

// Restarts the dispatcher (only when compiling with the dispatcher), sends the
// op through poptorch::fallback, and finally applies
// opReturningFirstArgument().
void endCpuOp(const c10::OperatorHandle &op, c10::Stack *stack) {
  // This op might have been called as part of a CPU model in which case we
  // don't want to re-start the dispatcher.
  if (poptorch::isCompilingWithDispatcher()) {
    poptorch::startDispatch();
    poptorch::fallback(op, stack);
  }
  opReturningFirstArgument(op, stack);
}

// CPU implementation of the cast: returns `tensor` converted to Float when
// `type` is "FLOAT16" or "FLOAT32", and `tensor` unchanged for any other type
// string.
at::Tensor castOp(const at::Tensor &tensor, const std::string &type) {
  // If the type to cast to is f16 then we need to cast to f32. The reason is
  // that by default we just ignore the type; however this only works if the
  // original type was f32.

  // Consider:
  /*
    MyTensor = MyTensor.as(INT8)

    MyTensor = MyTensor.half() # Convert to half.

    out = conv(MyTensor) # This would be an illegal INT8 convolution.
  */
  if (type == "FLOAT16" || type == "FLOAT32") {
    return tensor.to(at::ScalarType::Float);
  }
  return tensor;
}

// c10::List
// customOperation(c10::List inputs,
//                 std::string name, std::string domain,
//                 int64_t version, int64_t num_outputs,
//                 c10::List example_outputs,
//                 std::string attributes_map_id) {
//   return example_outputs;
// }
//
// Boxed equivalent of the sketch above: argument 5 is returned as the result.
void customOperation(const c10::OperatorHandle &op, c10::Stack *stack) {
  auto out = getNthArgument(op, stack, 5);
  updateStack(op, stack, {out});
}

// dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) ->
// Tensor
//
// CPU implementation: reads the first element of `start` (Int64, Int32 or
// Int16; anything else raises) and slices `self` along `dim` over
// [start, start + size) with the given step.
at::Tensor dynamicSlice(const at::Tensor &self, int64_t dim,
                        const at::Tensor &start, int64_t size, int64_t step) {
  auto st = start.scalar_type();
  std::int64_t start_int;
  if (st == torch::kInt64) {
    start_int = start.data_ptr()[0];
  } else if (st == torch::kInt32) {
    start_int = start.data_ptr()[0];
  } else if (st == torch::kInt16) {
    start_int = start.data_ptr()[0];
  } else {
    ERROR("Expected integer typed start tensor");
  }

  return at::slice(self, dim, {start_int}, {start_int + size}, step);
}

// dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int size) ->
// Tensor
//
// CPU implementation: reads the first element of `start` (Int64, Int32 or
// Int16; anything else raises) and scatters `src` into `self` along `dim`
// over [start, start + size) with a step of 1.
at::Tensor dynamicUpdate(const at::Tensor &self, const at::Tensor &src,
                         int64_t dim, const at::Tensor &start, int64_t size) {
  auto st = start.scalar_type();
  std::int64_t start_int;
  if (st == torch::kInt64) {
    start_int = start.data_ptr()[0];
  } else if (st == torch::kInt32) {
    start_int = start.data_ptr()[0];
  } else if (st == torch::kInt16) {
    start_int = start.data_ptr()[0];
  } else {
    ERROR("Expected integer typed start tensor");
  }

  return at::slice_scatter(self, src, dim, start_int, start_int + size, 1);
}

// CPU implementation of the CTC beam search decoder: requires a rank-3
// `log_probs` and only uses its first two sizes and `top_paths`.
std::tuple
ctcBeamSearchDecoder(const at::Tensor &log_probs,
                     const at::Tensor & /*lengths*/, int64_t /*blank*/,
                     int64_t /*width*/, int64_t top_paths) {
  ERROR_ON_MSG(log_probs.sizes().size() != 3,
               "Incorrect shape for first input to CTC beam search decoder.");
  const unsigned input_len = log_probs.sizes()[0];
  const unsigned batch_size =
log_probs.sizes()[1];

  // Zero-filled placeholders with the expected output shapes.
  const at::Tensor path_probs = at::zeros({batch_size, top_paths});
  const at::Tensor path_lens = at::zeros({batch_size, top_paths});
  const at::Tensor decoded_paths =
      at::zeros({batch_size, top_paths, input_len});

  return {path_probs, path_lens, decoded_paths};
}

// at::Tensor identityLoss(const at::Tensor &t, int64_t reduction)
//
// CPU implementation: `reduction` selects sum (0), mean (1) or a plain clone
// (2); any other value raises.
at::Tensor identityLoss(const at::Tensor &t, int64_t reduction) {
  constexpr int64_t sum = 0;
  constexpr int64_t mean = 1;
  constexpr int64_t none = 2;

  switch (reduction) {
  case sum:
    return at::sum(t);
  case mean:
    return at::mean(t);
  case none:
    return t.clone();
  default:
    ERROR("reduction must be sum (0), mean (1) or none (2)");
  }
}

// Expands to the two arguments of m.def(): a schema built from `signature`
// with CONSERVATIVE alias analysis, and the boxed opWithoutOutputs kernel.
#define OP_WITHOUT_OUTPUTS(signature)                                          \
  torch::schema(signature, c10::AliasAnalysisKind::CONSERVATIVE),              \
      PTC_BOXED(opWithoutOutputs)

// Schema definitions for every op in the `poptorch` namespace.
TORCH_LIBRARY(poptorch, m) {
  // These operations have no outputs, and so are registered with side-effects
  // to prevent being pruned by dead-code elimination
  m.def(OP_WITHOUT_OUTPUTS(
      "begin_ipu_block(int stage_id, int phase_id, int ipu_id) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("end_ipu_block() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("start_for_loop(Tensor[] inputs) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("start_if_block(Tensor condition) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("start_else_block(Tensor[] outputs_true) -> ()"));
  m.def(
      OP_WITHOUT_OUTPUTS("optimizer_group(int group, Tensor[] inputs) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("begin_multi_conv() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS(
      "end_multi_conv(float[]? "
      "available_memory_proportions, int[]? partials_types, int? "
      "plan_type, int? per_conv_reserved_tiles, float? "
      "cycle_back_off, int[]? enableConvDithering) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("push_name_scope(str name) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("pop_name_scope() -> ()"));
  m.def(OP_WITHOUT_OUTPUTS(
      "set_attribute(str attribute, str key, str value) -> ()"));
  m.def(OP_WITHOUT_OUTPUTS("clear_attribute(str attribute, str key) -> ()"));

  // Operations returning the first argument
  m.def("ipu_print_tensor(Tensor self, str title, int print_gradient, int "
        "summarize_threshold, int edge_items, int max_line_width, int digits, "
        "int float_format, str separator, str open_bracket, str close_bracket) "
        "-> Tensor");
  m.def("nop(Tensor self) -> Tensor");
  m.def("end_for_loop(Tensor[] outputs, Tensor[] "
        "inputs, int trip_count) -> Tensor[]");
  m.def("end_if_block(Tensor[] outputs, Tensor condition) -> Tensor[]");
  m.def("set_matmul_serialization(Tensor matmul, str "
        "mode, int factor, bool keep_precision) -> Tensor");
  m.def("set_overlap_for_input(Tensor t, str mode) -> Tensor");
  m.def("set_overlap_for_output(Tensor t, str mode) -> Tensor");
  m.def("recomputation_checkpoint(Tensor self) -> Tensor");
  m.def("set_available_memory(Tensor t, float mem) -> Tensor");

  m.def("custom_operation(Tensor[] inputs, str name, str domain, int "
        "domain_version, int num_outputs, Tensor[] outputs, str attributes) -> "
        "Tensor[]");
  m.def("ctc_beam_search_decoder(Tensor probs, "
        "Tensor lengths, int blank, int beam_width, int "
        "top_paths) -> (Tensor, Tensor, Tensor)");
  m.def("dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) "
        "-> Tensor");
  m.def("dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int "
        "size) "
        "-> Tensor");
  m.def("identity_loss(Tensor x, int reduction) -> Tensor");
  m.def("internal_cast(Tensor self, str dtype) -> Tensor");

  // call_cpu_op and end_cpu_op are special cases because they must
  // immediately switch the dispatcher on/off so the default poptorch
  // fallback cannot be used. They are also registered with side-effects
  // to ensure they are not reintercepted during constexpr evaluation
  m.def(torch::schema("end_cpu_op(Tensor[] output) -> Tensor[]",
                      c10::AliasAnalysisKind::CONSERVATIVE),
        PTC_BOXED(endCpuOp));
  m.def(torch::schema("call_cpu_op(Tensor[] inputs, str name) -> ()",
                      c10::AliasAnalysisKind::CONSERVATIVE),
        PTC_BOXED(callCpuOp));

  m.def("fps(Tensor src, "
        "int[] ptr, float ratio, bool random_start) -> Tensor");

  m.def("nearest(Tensor x, Tensor y, "
        "Tensor batch_x, Tensor batch_y) -> Tensor");

  m.def("nearest_batch_list(Tensor x, Tensor y, "
        "int[] batch_x, int[] batch_y) -> Tensor");
}

// CPU kernels for the `poptorch` ops.
TORCH_LIBRARY_IMPL(poptorch, CPU, m) {
  // Operations returning the first argument
  m.impl("end_for_loop", PTC_BOXED(opReturningFirstArgument));
  m.impl("end_if_block", PTC_BOXED(opReturningFirstArgument));
  m.impl("ipu_print_tensor", PTC_BOXED(opReturningFirstArgument));
  m.impl("nop", PTC_BOXED(opReturningFirstArgument));
  m.impl("recomputation_checkpoint", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_available_memory", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_matmul_serialization", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_overlap_for_input", PTC_BOXED(opReturningFirstArgument));
  m.impl("set_overlap_for_output", PTC_BOXED(opReturningFirstArgument));

  // Operations with their own CPU implementations
  m.impl("ctc_beam_search_decoder", PTC(ctcBeamSearchDecoder));
  m.impl("custom_operation", PTC_BOXED(customOperation));
  m.impl("dynamic_slice", PTC(dynamicSlice));
  m.impl("dynamic_update", PTC(dynamicUpdate));
  m.impl("identity_loss", PTC(identityLoss));
  m.impl("internal_cast", PTC(castOp));
}

// By default, if we don't register anything for autograd, the outputs of
// `poptorch::` ops will have no `grad_fn` (making them leaves). For PopART it's
// not inherently an issue since PopART does its own thing in the backward pass.
// However, PyTorch will error if you put the output of one of these ops through
// an inplace op: `a leaf Variable that requires grad is being used in an
// in-place operation.`
//
// The JIT trace will have the `grad_fn`s filled with whatever the previous
// `grad_fn` of the input was, so this isn't an issue.
//
// Note: Presumably, for non-PopART backends these will need to have
// implementations (`torch::autograd::Function` subclasses).
//
// NOTE(review): `dynamic_update` is defined in TORCH_LIBRARY(poptorch) but has
// no entry in the list below - confirm whether that is intentional.
TORCH_LIBRARY_IMPL(poptorch, AutogradIPU, m) {
  m.impl("begin_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_ipu_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("ipu_print_tensor", torch::autograd::autogradNotImplementedFallback());
  m.impl("internal_cast", torch::autograd::autogradNotImplementedFallback());
  m.impl("nop", torch::autograd::autogradNotImplementedFallback());
  m.impl("dynamic_slice", torch::autograd::autogradNotImplementedFallback());
  m.impl("custom_operation", torch::autograd::autogradNotImplementedFallback());
  m.impl("ctc_beam_search_decoder",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("identity_loss", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_for_loop", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_for_loop", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_if_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("start_else_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_if_block", torch::autograd::autogradNotImplementedFallback());
  m.impl("fps", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest", torch::autograd::autogradNotImplementedFallback());
  m.impl("nearest_batch_list",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("optimizer_group", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_matmul_serialization",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_input",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_overlap_for_output",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("recomputation_checkpoint",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("set_available_memory",
         torch::autograd::autogradNotImplementedFallback());
  m.impl("begin_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_multi_conv", torch::autograd::autogradNotImplementedFallback());
  m.impl("push_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("pop_name_scope", torch::autograd::autogradNotImplementedFallback());
  m.impl("end_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("call_cpu_op", torch::autograd::autogradNotImplementedFallback());
  m.impl("set_attribute", torch::autograd::autogradNotImplementedFallback());
  m.impl("clear_attribute", torch::autograd::autogradNotImplementedFallback());
}

================================================
FILE: poptorch/source/dispatch_tracer/RegisterMetaOps.cpp.inc
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include
#include

namespace poptorch::meta {
// Shape inference for the 2D NLL loss forward pass: returns meta tensors for
// the loss output and for the total weight.
std::tuple nllLoss2dForward(const at::Tensor &self,
                 const at::Tensor &/*target*/,
                 const c10::optional &/*weight*/,
                 int64_t reduction, int64_t /*ignore_index*/) {
  // If reduction is none, the shape is the input without number of
  // classes, which is the second element, i.e. (N, C, ...) to (N, ...)
  // except in the case of a 1D input (C) when it is ().
std::vector shape;
  // reduction == 0 is the "none" case described above; any other value leaves
  // `shape` empty.
  if (reduction == 0){
    shape = std::vector(self.sizes().begin(), self.sizes().end());
    if(shape.size() == 1) {
      shape.clear();
    } else {
      ERROR_ON(shape.size() < 2);
      // Drop the second dimension.
      shape.erase(shape.begin() + 1);
    }
  }

  at::Tensor output = at::meta::empty(shape, self.scalar_type());
  at::Tensor total_weight = at::meta::empty({}, self.scalar_type());
  return {output, total_weight};
}

// Shape inference for ctc_loss with integer-list lengths: the result has
// shape {log_probs.sizes()[1]} when reduction == 0 and `log_probs` is rank 3,
// and an empty shape otherwise.
at::Tensor ctcLoss(const at::Tensor &log_probs, const at::Tensor &/*targets*/,
                   at::IntArrayRef /*input_lengths*/,
                   at::IntArrayRef /*target_lengths*/, int64_t /*blank*/,
                   int64_t reduction, bool /*zero_infinity*/) {
  std::vector shape;
  if (reduction == 0 && log_probs.sizes().size() == 3) {
    shape = {log_probs.sizes()[1]};
  }
  return at::meta::empty(shape, log_probs.scalar_type());
}

// Shape inference for bincount: the output has exactly `minlength` elements
// and takes the dtype of `weights` when given, Int otherwise. Raises unless
// minlength is positive.
at::Tensor bincount(const at::Tensor &, const c10::optional & weights,
                    int64_t minlength) {
  ERROR_ON_MSG(minlength <= 0,
               "Bincount `minlength` must be specified and must be a constant. "
               "On the IPU MK2 platform the minimum length is also the "
               "maximum length");

  return at::meta::empty({minlength},
      weights ? weights->scalar_type() : c10::ScalarType::Int);
}

// Out-variant of bincount: performs the same `minlength` validation and
// returns `out` untouched.
at::Tensor & bincountOut(const at::Tensor &, const c10::optional &,
                    int64_t minlength, at::Tensor & out) {
  ERROR_ON_MSG(minlength <= 0,
               "Bincount `minlength` must be specified and must be a constant. "
               "On the IPU MK2 platform the minimum length is also the "
               "maximum length");
  return out;
}

// Shape inference for bucketize: same shape as `self`, with the dtype chosen
// by `out_int32`.
TORCH_API at::Tensor bucketize(const at::Tensor & self, const at::Tensor &,
                               bool out_int32=false, bool right=false) {
  UNUSED(right);
  UNUSED(out_int32);

  const auto input_shape = self.sizes().vec();
  return at::meta::empty(input_shape, out_int32 ?
c10::ScalarType::Int : c10::ScalarType::Long); } TORCH_API at::Tensor& hardsigmoidOut(at::Tensor const&, at::Tensor& out) { return out; } TORCH_API at::Tensor& siluOut(at::Tensor const&, at::Tensor& out) { return out; } TORCH_API at::Tensor & bucketizeOut(const at::Tensor & , const at::Tensor & , bool , bool , at::Tensor & out) { return out; } TORCH_API bool equal(const at::Tensor &, const at::Tensor &) { return {}; } torch::Tensor grid(torch::Tensor pos, torch::Tensor size, torch::optional optional_start, torch::optional optional_end) { pos = pos.view({pos.size(0), -1}); ERROR_ON_MSG(size.numel() != pos.size(1), "grid: size.numel() == pos.size(1)"); if (!optional_start.has_value()) optional_start = std::get<0>(pos.min(0)); else ERROR_ON_MSG(optional_start.value().numel() != pos.size(1), "grid: optional_start.value().numel() == pos.size(1)"); if (!optional_end.has_value()) optional_end = std::get<0>(pos.max(0)); else ERROR_ON_MSG(optional_start.value().numel() != pos.size(1), "grid: optional_start.value().numel() == pos.size(1)"); auto start = optional_start.value(); auto end = optional_end.value(); pos = pos - start.unsqueeze(0); auto num_voxels = (end - start).true_divide(size).toType(torch::kLong) + 1; num_voxels = num_voxels.cumprod(0); num_voxels = torch::cat({torch::ones({1}, num_voxels.options()), num_voxels}, 0); num_voxels = num_voxels.narrow(0, 0, size.size(0)); auto out = pos.true_divide(size.view({1, -1})).toType(torch::kLong); out *= num_voxels.view({1, -1}); out = out.sum(1); return out; } at::Tensor ctcLossTensor(const at::Tensor &log_probs, const at::Tensor &/*targets*/, const at::Tensor &/*input_lengths*/, const at::Tensor &/*target_lengths*/, int64_t /*blank*/, int64_t reduction, bool /*zero_infinity*/) { // If no reduction, get the batch size; from docs, this will be // `log_probs`' second dimension if it's 3D. 
std::vector shape;
  if (reduction == 0 && log_probs.sizes().size() == 3) {
    shape = {log_probs.sizes()[1]};
  }
  return at::meta::empty(shape, log_probs.scalar_type());
}

// Shape inference for median without a dimension: a 0-d tensor with the
// dtype of `self`.
at::Tensor median(const at::Tensor &self) {
  return at::meta::empty({}, self.scalar_type());
}

// Shape inference for median along `dim`: returns (values, indices) meta
// tensors. The reduced dimension becomes 1 when `keepdim` is set and is
// removed otherwise; indices are Long.
std::tuple medianDim(const at::Tensor &self,
                     int64_t dim, bool keepdim) {
  std::vector shape = self.sizes().vec();
  // Map a negative dim onto the corresponding non-negative index.
  dim = dim < 0 ? dim + self.sizes().size() : dim;

  if (!shape.empty()) {
    if (keepdim) {
      shape[dim] = 1;
    } else {
      shape.erase(shape.begin() + dim);
    }
  }

  auto values = at::meta::empty(shape, self.scalar_type());
  auto indices = at::meta::empty(shape, c10::ScalarType::Long);
  return {values, indices};
}

// Shape inference for count_nonzero over a list of dimensions: the listed
// dimensions are removed from the shape of `self`; with an empty list the
// shape is {1}. Raises if a dimension is listed twice.
at::Tensor countNonzero(const at::Tensor &self, at::IntArrayRef dim) {
  auto dim_vec = dim.vec();
  for (auto &d : dim_vec) {
    d = d < 0 ? d + self.sizes().size() : d;
  }

  std::vector shape = {1};
  if (dim.size() > 0) {
    shape = self.sizes().vec();
    // Erase from the highest index down so earlier erasures do not shift the
    // positions still to be removed.
    auto sorted_dims = dim_vec;
    std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater<>{});

    ERROR_ON_MSG(std::adjacent_find(sorted_dims.begin(), sorted_dims.end()) !=
                     sorted_dims.end(),
                 "The dimensions to count must be unique");

    for (auto d : sorted_dims) {
      shape.erase(shape.begin() + d);
    }
  }

  return at::meta::empty(shape, self.scalar_type());
}

// Shape inference for one_hot: the shape of `self` with `num_classes`
// appended. Raises when num_classes is -1.
at::Tensor oneHot(const at::Tensor &self, int64_t num_classes) {
  ERROR_ON_MSG(num_classes == -1,
               "OneHot num classes must be specified and must be constant.");

  auto shape = self.sizes().vec();
  shape.push_back(num_classes);
  return at::meta::empty(shape, self.scalar_type());
}

// Shape inference for upsample_nearest3d.
//
// Exactly one of `output_size` / `scale_factors` must be given. The trailing
// dimensions of `input` are replaced either by `output_size` or by the
// corresponding input sizes multiplied by `scale_factors` (truncated by the
// cast); leading dimensions are kept. Raises if neither or both are given, or
// if more trailing dimensions are requested than `input` has.
at::Tensor upsampleNearest3d(const at::Tensor &input,
                             at::OptionalSymIntArrayRef output_size,
                             c10::optional> scale_factors) {
  ERROR_ON_MSG(!scale_factors && !output_size,
               "Must specify either output_size or scale_factors, but not both.");

  const auto input_shape = input.sizes().vec();
  std::vector actual_output_size;
  if (output_size.has_value()) {
    ERROR_ON_MSG(scale_factors,
                 "Must specify either output_size or scale_factors, but not both.");
    actual_output_size.reserve(output_size->size());
    for (auto i : output_size.value()) {
      actual_output_size.push_back(i.as_int_unchecked());
    }
  } else if (scale_factors.has_value()) {
    // Validate before forming `input_shape.end() - scale_factors->size()`:
    // with more scale factors than input dimensions that iterator would point
    // before the start of `input_shape`.
    ERROR_ON_MSG(scale_factors->size() > input_shape.size(),
                 "The number of dimensions of the input (" +
                 std::to_string(input_shape.size()) +
                 ") must be more than the number of dimensions in the output (" +
                 std::to_string(scale_factors->size()) + ")");
    std::transform(scale_factors->begin(), scale_factors->end(),
                   input_shape.end() - scale_factors->size(),
                   std::back_inserter(actual_output_size),
                   [](double sf, std::int64_t shape) {
                     return static_cast(static_cast(shape) * sf);
                   });
  }

  ERROR_ON_MSG(actual_output_size.size() > input_shape.size(),
               "The number of dimensions of the input (" +
               std::to_string(input_shape.size()) +
               ") must be more than the number of dimensions in the output (" +
               std::to_string(actual_output_size.size()) + ")");

  // Leading input dimensions followed by the new trailing sizes.
  std::vector shape(input_shape.begin(), input_shape.end() - actual_output_size.size());
  shape.insert(shape.end(), actual_output_size.begin(), actual_output_size.end());
  return at::meta::empty(shape, input.scalar_type());
}

// Shape inference for max_pool3d: each pooled dimension of the input shape is
// replaced by the pooled extent computed from padding, dilation, kernel size
// and stride, rounded up or down according to `ceil_mode`.
at::Tensor maxPool3d(const at::Tensor &self,
                     at::IntArrayRef kernel_size,
                     at::IntArrayRef stride,
                     at::IntArrayRef padding,
                     at::IntArrayRef dilation,
                     bool ceil_mode) {
  std::vector input_shape = self.sizes().vec();

  ERROR_ON_MSG(input_shape.size() != kernel_size.size() + 1 &&
               input_shape.size() != kernel_size.size() + 2,
               "The kernel size (" << kernel_size.size() <<
               ") must be 1 or 2 less than the input rank (" <<
               input_shape.size() << ")");
  ERROR_ON(kernel_size.size() != stride.size());
  ERROR_ON(kernel_size.size() != padding.size());
  ERROR_ON(kernel_size.size() != dilation.size());

  // Number of leading (non-pooled) dimensions.
  const size_t offset = (input_shape.size() == kernel_size.size() + 1) ? 1 : 2;

  for (auto s = 0u; s < kernel_size.size(); s++) {
    double tmp = (input_shape[offset + s] + 2. * padding[s] -
                  dilation[s] * (kernel_size[s] - 1.) - 1.)
/ stride[s] + 1.;
    if (ceil_mode) {
      input_shape[offset + s] = std::ceil(tmp);
    } else {
      input_shape[offset + s] = std::floor(tmp);
    }
  }
  return at::meta::empty(input_shape, self.scalar_type());
}

// aten::nonzero cannot be shape-inferred, so this always raises.
at::Tensor nonzero(const at::Tensor &) {
  ERROR("Operations using aten::nonzero are unsupported because "
        "the output shape is determined by the tensor values. "
        "The IPU cannot support dynamic output shapes.");
}

// torch_scatter
//
// Shape inference shared by the scatter reductions: returns (output,
// argminmax) meta tensors of identical shape, the second one Long. The shape
// is that of `out` when given, otherwise that of `src` with `dim` resized to
// `dim_size`. Raises when neither `out` nor `dim_size` is given, or when both
// are given and disagree along `dim`.
std::tuple scatterMinMax(at::Tensor src, at::Tensor /*index*/,
                         int64_t dim, c10::optional out,
                         c10::optional dim_size) {
  std::vector out_shape = src.sizes().vec();
  // Map a negative dim onto the corresponding non-negative index.
  dim = dim < 0 ? dim + out_shape.size() : dim;

  if (out) {
    out_shape = out->sizes().vec();
  } else if (dim_size) {
    out_shape[dim] = *dim_size;
  } else {
    ERROR("You must provide either an output parameter or specify dim_size so the output shape may be inferred");
  }

  if (dim_size.has_value()) {
    ERROR_ON_MSG(*dim_size != out_shape[dim],
      "dim_size = " << *dim_size << " expected to be the same as out.shape()[dim] = "
                    << out_shape[dim] << ", dim = " << dim );
  }

  const auto output = at::meta::empty(out_shape, src.scalar_type());
  const auto argminmax = at::meta::empty(out_shape, c10::ScalarType::Long);
  return {output, argminmax};
}

// Same shape inference as scatterMinMax(), keeping only the first result.
at::Tensor scatterMul(at::Tensor src,at::Tensor index,
                      int64_t dim, c10::optional out,
                      c10::optional dim_size) {
  return std::get<0>(scatterMinMax(src, index, dim, out, dim_size));
}

// torch_cluster
//
// Shape inference for fps: the output is a 1-D Int tensor whose length is
// accumulated from consecutive `ptr` differences scaled by `ratio`.
at::Tensor fps(const torch::Tensor &src, const std::vector &ptr,
               double ratio, bool /*random_start*/) {
  const auto dim = src.dim();
  const auto ptr_size = ptr.size();
  const auto src_size = src.size(0);
  ERROR_ON_MSG(ratio <= 0.0 || ratio > 1.0,
               "`ratio` (" << ratio << ") has to be in range (0.0, 1.0>");
  ERROR_ON_MSG(dim != 2,
               "`src` is supposed to be 2d Tensor, while it has " << dim << " dims");
  ERROR_ON_MSG(
      ptr_size < 2 || ptr_size > static_cast(src_size) + 1,
      "`ptr` length (" << ptr_size << ") is supposed to be < src.size(0) ("
                       << src_size << ")");

  std::int64_t out_size = 0;
  for
(size_t i = 1; i < ptr_size; i++)
    // Each segment [ptr[i - 1], ptr[i]) contributes ceil(length * ratio).
    out_size += std::ceil(static_cast(ptr[i] - ptr[i - 1]) * ratio);

  return at::meta::empty({out_size}, c10::ScalarType::Int);
}

// torch_spline_conv
//
// Shape inference for spline_basis: returns (basis, weightIndex) meta tensors
// of shape {numEdges, numSplines}, where numEdges and numDims are the first
// two sizes of `pseudo` and numSplines is (degree + 1) ^ numDims.
std::tuple splineBasis(at::Tensor pseudo, at::Tensor /*kernel_size*/,
                       at::Tensor /*is_open_spline*/, int64_t degree) {
  const std::vector in_shape = pseudo.sizes().vec();
  const std::int64_t numEdges = in_shape[0];
  const std::int64_t numDims = in_shape[1];
  // + 0.5 so the floating-point power rounds to the nearest integer when
  // truncated.
  const std::int64_t numSplines = std::pow(degree + 1, numDims) + 0.5;
  const std::vector out_shape({numEdges, numSplines});

  const auto basis = at::meta::empty(out_shape, pseudo.scalar_type());
  const auto weightIndex = at::meta::empty(out_shape, c10::ScalarType::Int);
  return {basis, weightIndex};
}

// Shape inference for spline_weighting: the output has shape
// {input.sizes()[0], weight.sizes()[2]} and the dtype of `input`.
at::Tensor splineWeighting(at::Tensor input, at::Tensor weight,
                           at::Tensor /*basis*/, at::Tensor /*weight_index*/) {
  const std::vector in_shape = input.sizes().vec();
  const std::vector w_shape = weight.sizes().vec();
  const std::vector out_shape({in_shape[0], w_shape[2]});
  const auto output = at::meta::empty(out_shape, input.scalar_type());
  return output;
}

// Shape inference for nearest: a 1-D Int tensor with one entry per row of `x`.
at::Tensor nearest(const torch::Tensor &x, const torch::Tensor &,
                   const torch::Tensor &, const torch::Tensor &) {
  return at::meta::empty({x.sizes().front()}, c10::ScalarType::Int);
}

// Shape inference for nearest_batch_list: a 1-D Int tensor with one entry per
// row of `x`.
at::Tensor nearest_batch_list(const torch::Tensor &x, const torch::Tensor &,
                   const std::vector &, const std::vector &) {
  return at::meta::empty({x.sizes().front()}, c10::ScalarType::Int);
}

// poptorch

// dynamic_slice(Tensor self, int dim, Tensor start, int size, int step) -> Tensor
//
// Shape inference: the shape of `self` with dimension `dim` replaced by
// ceil(size / step).
// NOTE(review): `dim` is used as a raw index here, with no handling of
// negative values - confirm callers only pass non-negative dims.
at::Tensor dynamicSlice(const at::Tensor &self, int64_t dim,
                        const at::Tensor &/*start*/, int64_t size,
                        int64_t step) {
  auto shape = self.sizes().vec();
  shape[dim] = (size + (step - 1)) / step;
  return at::meta::empty(shape, self.scalar_type());
}

// dynamic_update(Tensor self, Tensor src, int dim, Tensor start, int size) -> Tensor
at::Tensor dynamicUpdate(const at::Tensor &self, const at::Tensor & /*src*/,
                         int64_t /*dim*/,
const at::Tensor & /*start*/, int64_t /*size*/) { auto shape = self.sizes().vec(); return at::meta::empty(shape, self.scalar_type()); } // custom_operation(Tensor[] inputs, str name, str domain, int domain_version, int num_outputs, Tensor(a!)[] outputs, str attributes) -> Tensor(a!)[] std::vector customOperation(const std::vector &/*inputs*/, const std::string &/*name*/, const std::string &/*domain*/, int64_t /*domain_version*/, int64_t /*num_outputs*/, const std::vector &outputs, const std::string &/*attributes*/) { std::vector ret; for (const auto &t : outputs) { ret.push_back(at::meta::empty(t.sizes(), t.scalar_type())); } return ret; } at::Tensor & tanh_backward_out(const at::Tensor& , const at::Tensor &, at::Tensor & grad_input) { return grad_input; } std::tuple native_layer_norm_backward( const at::Tensor&, const at::Tensor& input, at::IntArrayRef , const at::Tensor&, const at::Tensor&, const c10::optional& weight, const c10::optional& bias, ::std::array output_mask) { return { at::meta::empty(output_mask[0] ? input.sizes().vec() : std::vector{}, input.scalar_type()), at::meta::empty(output_mask[1] && weight ? weight->sizes().vec() : std::vector{}, weight && weight->defined() ? weight->scalar_type() : input.scalar_type()), at::meta::empty(output_mask[2] && bias ? bias->sizes().vec() : std::vector{}, bias && weight->defined() ? 
bias->scalar_type() : input.scalar_type()) }; } // ctc_beam_search_decoder(Tensor probs, Tensor lengths, int blank, int beam_width, int top_paths) -> (Tensor, Tensor, Tensor) std::tuple ctcBeamSearchDecoder(const at::Tensor &probs, const at::Tensor &/*lengths*/, int64_t /*blank*/, int64_t /*beam_width*/, int64_t top_paths) { ERROR_ON_MSG(probs.sizes().size() != 3, "Input probablities tensor must be rank-3 for " "`ctc_beam_search_decoder`."); const auto input_size = probs.sizes()[0]; const auto batch_size = probs.sizes()[1]; auto out_probs = at::meta::empty({batch_size, top_paths}, probs.scalar_type()); auto out_paths = at::meta::empty({batch_size, top_paths, input_size}, probs.scalar_type()); return {out_probs, out_probs, out_paths}; } // identity_loss(Tensor x, str reduction) -> Tensor at::Tensor identityLoss(const at::Tensor &x, int64_t reduction) { constexpr int64_t sum = 0; constexpr int64_t mean = 1; constexpr int64_t none = 2; std::vector sizes; switch (reduction) { case sum: case mean: break; case none: sizes = x.sizes().vec(); break; default: ERROR("reduction must be sum (0), mean (1) or none (2)"); } return at::meta::empty(sizes, x.scalar_type()); } void opWithoutOutputs(const c10::OperatorHandle &/*op*/, c10::Stack *stack) { stack->clear(); } void opReturningFirstArgument(const c10::OperatorHandle &/*op*/, c10::Stack *stack) { stack->erase(stack->begin() + 1, stack->end()); } } // namespace poptorch::meta TORCH_LIBRARY_IMPL(aten, Meta, m) { m.impl("bincount", PTC(poptorch::meta::bincount)); m.impl("bincount.out", PTC(poptorch::meta::bincountOut)); m.impl("bucketize.Tensor", PTC(poptorch::meta::bucketize)); m.impl("bucketize.Tensor_out", PTC(poptorch::meta::bucketizeOut)); m.impl("equal", PTC(poptorch::meta::equal)); m.impl("hardsigmoid.out", PTC(poptorch::meta::hardsigmoidOut)); m.impl("rrelu_with_noise", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("count_nonzero.dim_IntList", PTC(poptorch::meta::countNonzero)); m.impl("ctc_loss.Tensor", 
PTC(poptorch::meta::ctcLossTensor)); m.impl("ctc_loss.IntList", PTC(poptorch::meta::ctcLoss)); m.impl("max_pool3d", PTC(poptorch::meta::maxPool3d)); m.impl("median", PTC(poptorch::meta::median)); m.impl("median.dim", PTC(poptorch::meta::medianDim)); m.impl("nll_loss2d_forward", PTC(poptorch::meta::nllLoss2dForward)); m.impl("nonzero", PTC(poptorch::meta::nonzero)); m.impl("one_hot", PTC(poptorch::meta::oneHot)); m.impl("silu.out", PTC(poptorch::meta::siluOut)); m.impl("upsample_nearest3d.vec", PTC(poptorch::meta::upsampleNearest3d)); m.impl("tanh_backward.grad_input", PTC(poptorch::meta::tanh_backward_out)); m.impl("native_layer_norm_backward", PTC(poptorch::meta::native_layer_norm_backward)); } TORCH_LIBRARY_IMPL(torch_scatter, Meta, m) { m.impl("scatter_max", PTC(poptorch::meta::scatterMinMax)); m.impl("scatter_min", PTC(poptorch::meta::scatterMinMax)); m.impl("scatter_mul", PTC(poptorch::meta::scatterMul)); } TORCH_LIBRARY_IMPL(torch_cluster, Meta, m) { m.impl("grid", PTC(poptorch::meta::grid)); } TORCH_LIBRARY_IMPL(torch_spline_conv, Meta, m) { m.impl("spline_basis", PTC(poptorch::meta::splineBasis)); m.impl("spline_weighting", PTC(poptorch::meta::splineWeighting)); } TORCH_LIBRARY_IMPL(poptorch, Meta, m) { m.impl("push_name_scope", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("pop_name_scope", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("begin_ipu_block", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("end_ipu_block", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("start_for_loop", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("start_if_block", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("start_else_block", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("optimizer_group", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("call_cpu_op", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("set_attribute", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("clear_attribute", 
PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("begin_multi_conv", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("end_multi_conv", PTC_BOXED(poptorch::meta::opWithoutOutputs)); m.impl("end_cpu_op", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("end_for_loop", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("end_if_block", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("internal_cast", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("ipu_print_tensor", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("nop", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("recomputation_checkpoint", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("set_available_memory", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("set_matmul_serialization", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("set_overlap_for_input", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("set_overlap_for_output", PTC_BOXED(poptorch::meta::opReturningFirstArgument)); m.impl("ctc_beam_search_decoder", PTC(poptorch::meta::ctcBeamSearchDecoder)); m.impl("custom_operation", PTC(poptorch::meta::customOperation)); m.impl("dynamic_slice", PTC(poptorch::meta::dynamicSlice)); m.impl("dynamic_update", PTC(poptorch::meta::dynamicUpdate)); m.impl("identity_loss", PTC(poptorch::meta::identityLoss)); m.impl("fps", PTC(poptorch::meta::fps)); m.impl("nearest", PTC(poptorch::meta::nearest)); m.impl("nearest_batch_list", PTC(poptorch::meta::nearest_batch_list)); } TORCH_LIBRARY_IMPL(poptorch, AutogradMeta, m) { m.impl("begin_ipu_block", torch::autograd::autogradNotImplementedFallback()); m.impl("end_ipu_block", torch::autograd::autogradNotImplementedFallback()); m.impl("ipu_print_tensor", torch::autograd::autogradNotImplementedFallback()); m.impl("internal_cast", torch::autograd::autogradNotImplementedFallback()); m.impl("nop", torch::autograd::autogradNotImplementedFallback()); 
m.impl("dynamic_slice", torch::autograd::autogradNotImplementedFallback()); m.impl("dynamic_update", torch::autograd::autogradNotImplementedFallback()); m.impl("custom_operation", torch::autograd::autogradNotImplementedFallback()); m.impl("ctc_beam_search_decoder", torch::autograd::autogradNotImplementedFallback()); m.impl("identity_loss", torch::autograd::autogradNotImplementedFallback()); m.impl("start_for_loop", torch::autograd::autogradNotImplementedFallback()); m.impl("end_for_loop", torch::autograd::autogradNotImplementedFallback()); m.impl("start_if_block", torch::autograd::autogradNotImplementedFallback()); m.impl("start_else_block", torch::autograd::autogradNotImplementedFallback()); m.impl("end_if_block", torch::autograd::autogradNotImplementedFallback()); m.impl("optimizer_group", torch::autograd::autogradNotImplementedFallback()); m.impl("set_matmul_serialization", torch::autograd::autogradNotImplementedFallback()); m.impl("set_overlap_for_input", torch::autograd::autogradNotImplementedFallback()); m.impl("set_overlap_for_output", torch::autograd::autogradNotImplementedFallback()); m.impl("recomputation_checkpoint", torch::autograd::autogradNotImplementedFallback()); m.impl("set_available_memory", torch::autograd::autogradNotImplementedFallback()); m.impl("begin_multi_conv", torch::autograd::autogradNotImplementedFallback()); m.impl("end_multi_conv", torch::autograd::autogradNotImplementedFallback()); m.impl("push_name_scope", torch::autograd::autogradNotImplementedFallback()); m.impl("pop_name_scope", torch::autograd::autogradNotImplementedFallback()); m.impl("end_cpu_op", torch::autograd::autogradNotImplementedFallback()); m.impl("call_cpu_op", torch::autograd::autogradNotImplementedFallback()); m.impl("set_attribute", torch::autograd::autogradNotImplementedFallback()); m.impl("clear_attribute", torch::autograd::autogradNotImplementedFallback()); m.impl("fps", torch::autograd::autogradNotImplementedFallback()); m.impl("nearest", 
torch::autograd::autogradNotImplementedFallback()); m.impl("nearest_batch_list", torch::autograd::autogradNotImplementedFallback()); } // For some reason these operations are first dispatched to AutogradMeta, // so we ignore and allow them pass through to Meta TORCH_LIBRARY_IMPL(aten, AutogradMeta, m) { m.impl("ctc_loss.Tensor", torch::autograd::autogradNotImplementedFallback()); m.impl("ctc_loss.IntList", torch::autograd::autogradNotImplementedFallback()); m.impl("max_pool3d", torch::autograd::autogradNotImplementedFallback()); m.impl("one_hot", torch::autograd::autogradNotImplementedFallback()); m.impl("bucketize.Tensor", torch::autograd::autogradNotImplementedFallback()); m.impl("bucketize.Tensor_out", torch::autograd::autogradNotImplementedFallback()); m.impl("bucketize.Scalar", torch::autograd::autogradNotImplementedFallback()); } TORCH_LIBRARY_IMPL(torch_scatter, AutogradMeta, m) { m.impl("scatter_max", torch::autograd::autogradNotImplementedFallback()); m.impl("scatter_min", torch::autograd::autogradNotImplementedFallback()); m.impl("scatter_mul", torch::autograd::autogradNotImplementedFallback()); } TORCH_LIBRARY_IMPL(torch_cluster, AutogradMeta, m) { m.impl("grid", torch::autograd::autogradNotImplementedFallback()); } TORCH_LIBRARY_IMPL(torch_spline_conv, AutogradMeta, m) { m.impl("spline_basis", torch::autograd::autogradNotImplementedFallback()); m.impl("spline_weighting", torch::autograd::autogradNotImplementedFallback()); } ================================================ FILE: poptorch/source/dispatch_tracer/RegisterOptionalAtenOps.cpp.inc ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. TORCH_LIBRARY_IMPL(aten, IPU, m) { // These ops otherwise require direct access to the storage of an // `IpuTensorImpl`, so we must implement them ourselves. 
m.impl("alias", PTC_BOXED(poptorch::fallback)); m.impl("bucketize.Tensor", PTC_BOXED(poptorch::fallback)); m.impl("bucketize.Tensor_out", PTC_BOXED(poptorch::fallback)); m.impl("bucketize.Scalar", PTC_BOXED(poptorch::fallback)); m.impl("copy_", PTC_BOXED(poptorch::copyInplace)); m.impl("detach", PTC_BOXED(poptorch::detach)); m.impl("_local_scalar_dense", PTC(poptorch::localScalarDense)); m.impl("item", PTC(poptorch::item)); m.impl("empty.memory_format", PTC(poptorch::emptyMemoryFormat)); m.impl("empty_strided", PTC(poptorch::emptyStrided)); m.impl("_weight_norm_interface", PTC_BOXED(poptorch::weightNormInterface)); m.impl("index.Tensor", PTC_BOXED(poptorch::fallback)); m.impl("convolution", PTC_BOXED(poptorch::fallback)); m.impl("convolution_backward", PTC_BOXED(poptorch::fallback)); // These ops must be intercepted so that meta type inference // doesn't have to deal with "out" tensors that aren't directly // assigned to m.impl("median.dim", PTC_BOXED(poptorch::fallback)); m.impl("min.dim", PTC_BOXED(poptorch::fallback)); m.impl("max.dim", PTC_BOXED(poptorch::fallback)); m.impl("topk", PTC_BOXED(poptorch::fallback)); m.impl("nll_loss_forward", PTC_BOXED(poptorch::fallback)); m.impl("nll_loss2d_forward", PTC_BOXED(poptorch::fallback)); m.impl("transpose.int", PTC_BOXED(poptorch::fallback)); m.impl("expand", PTC_BOXED(poptorch::fallback)); m.impl("_unsafe_view", PTC_BOXED(poptorch::fallback)); m.impl("gather", PTC_BOXED(poptorch::fallback)); m.impl("dropout", PTC_BOXED(poptorch::fallback)); m.impl("avg_pool2d.out", PTC_BOXED(poptorch::fallback)); m.impl("avg_pool3d.out", PTC_BOXED(poptorch::fallback)); m.impl("max_pool1d", PTC_BOXED(poptorch::fallback)); m.impl("max_pool2d", PTC_BOXED(poptorch::fallback)); m.impl("max_pool3d", PTC_BOXED(poptorch::fallback)); m.impl("adaptive_avg_pool1d", PTC_BOXED(poptorch::fallback)); m.impl("adaptive_avg_pool2d", PTC_BOXED(poptorch::fallback)); m.impl("adaptive_avg_pool3d", PTC_BOXED(poptorch::fallback)); m.impl("trunc", 
PTC_BOXED(poptorch::fallback)); m.impl("min", PTC_BOXED(poptorch::fallback)); m.impl("amin", PTC_BOXED(poptorch::fallback)); m.impl("minimum", PTC_BOXED(poptorch::fallback)); m.impl("max", PTC_BOXED(poptorch::fallback)); m.impl("amax", PTC_BOXED(poptorch::fallback)); m.impl("maximum", PTC_BOXED(poptorch::fallback)); m.impl("argsort", PTC_BOXED(poptorch::fallback)); m.impl("one_hot", PTC_BOXED(poptorch::fallback)); m.impl("all", PTC_BOXED(poptorch::fallback)); m.impl("any", PTC_BOXED(poptorch::fallback)); m.impl("feature_dropout", PTC_BOXED(poptorch::fallback)); m.impl("feature_dropout_", PTC_BOXED(poptorch::fallback)); m.impl("embedding", PTC_BOXED(poptorch::fallback)); // Needed due to "CompositeImplicitAutograd" m.impl("native_group_norm", torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>()); m.impl("native_layer_norm", torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>()); m.impl("lstm.input", torch::CppFunction::makeFromBoxedFunction<&poptorch::fallback>()); // If we don't intercept these ops, they will be decomposed into // as_strided which is harder to handle. 
m.impl("slice.Tensor", PTC_BOXED(poptorch::fallback)); m.impl("squeeze", PTC_BOXED(poptorch::fallback)); m.impl("squeeze_", PTC_BOXED(poptorch::fallback)); m.impl("squeeze.dim", PTC_BOXED(poptorch::fallback)); m.impl("squeeze_.dim", PTC_BOXED(poptorch::fallback)); m.impl("squeeze.dims", PTC_BOXED(poptorch::fallback)); m.impl("squeeze_.dims", PTC_BOXED(poptorch::fallback)); m.impl("unsqueeze", PTC_BOXED(poptorch::fallback)); m.impl("permute", PTC_BOXED(poptorch::fallback)); m.impl("select.int", PTC_BOXED(poptorch::fallback)); m.impl("transpose_", PTC_BOXED(poptorch::fallback)); m.impl("split_with_sizes", PTC_BOXED(poptorch::fallback)); // If we don't intercept this op, it will be decomposed into // _index_put_impl_, which exposes unnecessary implementation // details m.impl("index_put_", PTC_BOXED(poptorch::fallback)); // If we don't intercept this op, it will be converted into a clone followed // by an index_put_, which is inefficient in eager mode m.impl("index_put", PTC_BOXED(poptorch::fallback)); // If we don't intercept this op, it will be converted into a clone followed // by an baddbmm.out, which is inefficient in eager mode m.impl("baddbmm", PTC_BOXED(poptorch::fallback)); // If we don't intercept this op, it will be converted into a clone followed // by an masked_fill_.Scalar, which is inefficient in eager mode m.impl("masked_fill.Scalar", PTC_BOXED(poptorch::fallback)); // If we don't catch these, PyTorch will try to call aten::resize_ on the // result which is not supported. 
m.impl("frobenius_norm.out", PTC_BOXED(poptorch::fallback)); m.impl("frobenius_norm.dim", PTC_BOXED(poptorch::fallback)); // Use our own repeat op m.impl("repeat", PTC_BOXED(poptorch::fallback)); m.impl("constant_pad_nd", PTC_BOXED(poptorch::fallback)); m.impl("binary_cross_entropy_with_logits", PTC_BOXED(poptorch::fallback)); m.impl("binary_cross_entropy_with_logits_backward", PTC_BOXED(poptorch::fallback)); // If we don't catch it here, PyTorch will decompose bilinear into an enormous // number of ops, which will result in an all-zeros output. m.impl("bilinear", PTC_BOXED(poptorch::fallback)); // Loss functions: these are needed for popart, so that we can mark the loss // tensor (see `IsLoss`); otherwise, the op will get decomposed by PyTorch. m.impl("cosine_embedding_loss", PTC_BOXED(poptorch::fallback)); m.impl("ctc_loss.IntList", PTC_BOXED(poptorch::fallback)); m.impl("ctc_loss.Tensor", PTC_BOXED(poptorch::fallback)); m.impl("hinge_embedding_loss", PTC_BOXED(poptorch::fallback)); m.impl("kl_div", PTC_BOXED(poptorch::fallback)); m.impl("l1_loss", PTC_BOXED(poptorch::fallback)); m.impl("margin_ranking_loss", PTC_BOXED(poptorch::fallback)); m.impl("poisson_nll_loss", PTC_BOXED(poptorch::fallback)); m.impl("soft_margin_loss.out", PTC_BOXED(poptorch::fallback)); m.impl("triplet_margin_loss", PTC_BOXED(poptorch::fallback)); m.impl("mse_loss", PTC_BOXED(poptorch::fallback)); m.impl("smooth_l1_loss", PTC_BOXED(poptorch::fallback)); // Scatter: By default, PyTorch's handler will fail if the index tensor isn't // a tensor of int64s (see `scatter_gather_dtype_check` in PyTorch) -- ours // will have been coerced to int32s. 
m.impl("scatter.src", PTC_BOXED(poptorch::fallback)); m.impl("scatter.src_out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_.src", PTC_BOXED(poptorch::fallback)); m.impl("scatter.value", PTC_BOXED(poptorch::fallback)); m.impl("scatter.value_out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_.value", PTC_BOXED(poptorch::fallback)); m.impl("scatter.reduce", PTC_BOXED(poptorch::fallback)); m.impl("scatter.reduce_out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_.reduce", PTC_BOXED(poptorch::fallback)); m.impl("scatter.value_reduce", PTC_BOXED(poptorch::fallback)); m.impl("scatter.value_reduce_out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_.value_reduce", PTC_BOXED(poptorch::fallback)); m.impl("scatter_add", PTC_BOXED(poptorch::fallback)); m.impl("scatter_add.out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_add_", PTC_BOXED(poptorch::fallback)); m.impl("scatter_reduce.two", PTC_BOXED(poptorch::fallback)); m.impl("scatter_reduce.two_out", PTC_BOXED(poptorch::fallback)); m.impl("scatter_reduce_.two", PTC_BOXED(poptorch::fallback)); m.impl("select_scatter", PTC_BOXED(poptorch::fallback)); m.impl("select_scatter.out", PTC_BOXED(poptorch::fallback)); m.impl("_prelu_kernel", PTC_BOXED(poptorch::fallback)); m.impl("take_along_dim", PTC_BOXED(poptorch::fallback)); m.impl("take_along_dim.out", PTC_BOXED(poptorch::fallback)); } TORCH_LIBRARY_IMPL(poptorch, IPU, m) { m.impl("fps", PTC_BOXED(poptorch::fallback)); m.impl("nearest", PTC_BOXED(poptorch::fallback)); m.impl("nearest_batch_list", PTC_BOXED(poptorch::fallback)); } TORCH_LIBRARY_IMPL(torch_scatter, IPU, m) { m.impl("scatter_max", PTC_BOXED(poptorch::fallback)); m.impl("scatter_min", PTC_BOXED(poptorch::fallback)); m.impl("scatter_mul", PTC_BOXED(poptorch::fallback)); } TORCH_LIBRARY_IMPL(torch_cluster, IPU, m) { m.impl("grid", PTC_BOXED(poptorch::fallback)); } TORCH_LIBRARY_IMPL(torch_spline_conv, IPU, m) { m.impl("spline_basis", PTC_BOXED(poptorch::fallback)); m.impl("spline_weighting", 
PTC_BOXED(poptorch::fallback)); } ================================================ FILE: poptorch/source/dispatch_tracer/Tensor.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include "Tensor.hpp" #include #include #include #include #include #include #include #include #include #include #include #include "CommonHelperFunctions.hpp" #include "ValueMapper.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "pytorch_bridge/CompilerTypes.hpp" #include "pytorch_bridge/IpuSession.hpp" namespace poptorch { namespace { using BufferPtr = std::shared_ptr; using TensorViewPtr = std::shared_ptr; template struct Overloaded : Ts... { using Ts::operator()...; }; template Overloaded(Ts...) -> Overloaded; std::shared_ptr getTensorDetails(const at::TensorImpl &ipu_tensor); // This is just a useful helper since sometimes we need to pass both keys in. c10::DispatchKeySet dispatch_key_set{c10::DispatchKey::IPU, c10::DispatchKey::AutogradIPU}; } // namespace poptorch_ir::Type toCompilerType(const at::ScalarType &elem_type) { switch (elem_type) { case at::ScalarType::Bool: return poptorch_ir::Type::BOOL; case at::ScalarType::Byte: return poptorch_ir::Type::UNSIGNED_CHAR; case at::ScalarType::Char: return poptorch_ir::Type::CHAR; case at::ScalarType::Float: case at::ScalarType::Double: // We will convert this. return poptorch_ir::Type::FLOAT; case at::ScalarType::Half: return poptorch_ir::Type::HALF; case at::ScalarType::Short: return poptorch_ir::Type::SHORT; case at::ScalarType::Int: case at::ScalarType::Long: // We will convert this. 
return poptorch_ir::Type::INT; default: ERROR("Unsupported tensor input type from pytorch: " << elem_type); } } poptorch_ir::Type toCompilerElementType(const at::Tensor &tensor) { auto dtype = tensor.dtype(); return toCompilerType(dtype.toScalarType()); } // Return the data size in bytes of a tensor (i.e num_elems * elem_size) uint64_t tensorImplDataSize(const at::TensorImpl &impl) { auto shape = impl.sizes(); const std::int64_t nelems = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); const auto elem_size = impl.itemsize(); return nelems * elem_size; } // This is our own TensorImpl: this is stored in every at::Tensor of type IPU. // // This implementation is inspired by VulkanOpaqueTensorImpl / OpaqueTensorImpl: // they seem to have similar needs to us. struct IpuTensorImpl : public at::TensorImpl { // Shallow copy constructor (Both instances will share the same host buffer if // it exists). Shouldn't be called directly: use shallow_copy_and_detach() // instead. IpuTensorImpl(const IpuTensorImpl &src) : IpuTensorImpl(src.dtype(), src.device(), src.sizes_and_strides_.sizes_arrayref(), src.sizes_and_strides_.strides_arrayref(), src.details) {} void release_resources() override { details.reset(); } IpuTensorImpl(const caffe2::TypeMeta data_type, c10::Device device, c10::IntArrayRef sizes, c10::IntArrayRef strides, const std::shared_ptr &details_) : at::TensorImpl(dispatch_key_set, data_type, device), details(details_) { // set_sizes must be called before stride_at because it resizes the // array that stores both sizes and strides. 
sizes_and_strides_.set_sizes(sizes); for (uint dim = 0; dim < strides.size(); ++dim) { sizes_and_strides_.stride_at(dim) = strides.at(dim); } set_storage_access_should_throw(); set_custom_sizes_strides(at::TensorImpl::SizesStridesPolicy::Default); is_non_overlapping_and_dense_ = false; refresh_numel(); } c10::intrusive_ptr shallow_copy_and_detach(const c10::VariableVersion &version_counter, bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive(*this); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), /*version_counter=*/version_counter, /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); impl->refresh_numel(); return impl; } c10::intrusive_ptr shallow_copy_and_detach(c10::VariableVersion &&version_counter, bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive(*this); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), /*version_counter=*/version_counter, /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); impl->refresh_numel(); return impl; } void set_size(int64_t dim, int64_t new_size) override { UNUSED(dim); UNUSED(new_size); AT_ERROR("IPU tensors do not have set_size"); } void set_stride(int64_t dim, int64_t new_stride) override { UNUSED(dim); UNUSED(new_stride); AT_ERROR("IPU tensors do not have set_stride"); } void set_storage_offset(int64_t storage_offset) override { UNUSED(storage_offset); AT_ERROR("IPU tensors do not have set_storage_offset"); } std::shared_ptr details; private: const char *tensorimpl_type_name() const override { return "IpuTensorImpl"; } }; namespace { IpuTensorImpl *tryIpuTensorImpl(const at::Tensor &tensor) { return dynamic_cast(tensor.unsafeGetTensorImpl()); } IpuTensorImpl *toIpuTensorImpl(const at::Tensor &tensor) { auto *ptr = tryIpuTensorImpl(tensor); ERROR_ON_MSG(ptr == nullptr, "Expected an IPU tensor but " << tensor.unsafeGetTensorImpl() << " is " << tensor.unsafeGetTensorImpl()->device_type()); return ptr; } const 
IpuTensorImpl *toIpuTensorImpl(const at::TensorImpl &tensor) { const auto *impl = dynamic_cast(&tensor); ERROR_ON_MSG(impl == nullptr, "Expected an IPU tensor but " << &tensor << " is " << tensor.device_type()); return impl; } std::shared_ptr getTensorDetails(const at::TensorImpl &ipu_tensor) { return toIpuTensorImpl(ipu_tensor)->details; } // TODO(T61601) Create a proper implementation of GuardImpl struct GuardImpl : public c10::impl::DeviceGuardImplInterface { at::DeviceType type() const override { return at::DeviceType::IPU; } c10::Device exchangeDevice(c10::Device device) const override { logging::trace("exchangeDevice: current {} new {}", _current_device, device); c10::Device old = _current_device; *const_cast(&_current_device) = device; return old; } // Called by the dispatcher every time the user passes a device type without // an ID to a "to()" method For example: my_tensor.to(torch.device("ipu")) c10::Device getDevice() const override { return _current_device; } void setDevice(c10::Device device) const override { logging::trace("setDevice: current {} new {}", _current_device, device); *const_cast(&_current_device) = device; } void uncheckedSetDevice(c10::Device device) const noexcept override { logging::trace("uncheckedSetDevice: current {} new {}", _current_device, device); *const_cast(&_current_device) = device; } // Used by the autograd. // Streams are essentially command queues: if kernels A & B are added to the // same stream, A is guaranteed to have completed before B starts. // For A & B to be run in parallel they need to be added to different // streams. 
c10::Stream getStream(c10::Device device) const noexcept override { return c10::Stream(c10::Stream::DEFAULT, device); } c10::Stream exchangeStream(c10::Stream s) const noexcept override { UNUSED(s); return c10::Stream(c10::Stream::DEFAULT, getDevice()); } // Used by torch::autograd::Engine::initialize_device_threads_pool c10::DeviceIndex deviceCount() const noexcept override { return 1; } private: c10::Device _current_device{at::DeviceType::IPU, 0}; }; C10_REGISTER_GUARD_IMPL(IPU, GuardImpl) poptorch_ir::TensorType getTensorType(const at::ScalarType &scalar_type, std::vector sizes) { return {std::move(sizes), toCompilerType(scalar_type)}; } } // namespace poptorch_ir::TensorType getTensorType(const at::Tensor &tensor) { return getTensorType(tensor.scalar_type(), tensor.sizes().vec()); } uint64_t ipuTensorId(const at::Tensor &tensor) { return getTensorDetails(*tensor.unsafeGetTensorImpl())->tensor_id; } uint64_t ipuTensorId(const at::TensorImpl &tensor) { return toIpuTensorImpl(tensor)->details->tensor_id; } bool isIpuTensor(const at::Tensor &tensor) { return tryIpuTensorImpl(tensor) != nullptr; } std::string str(const at::Tensor &tensor) { std::stringstream ss; ss << "impl_ " << reinterpret_cast(tensor.unsafeGetTensorImpl()); if (!tensor.defined()) { ss << " type: "; } else { auto device_type = tensor.unsafeGetTensorImpl()->device_type(); ss << " type " << device_type; if (device_type == at::DeviceType::IPU) { auto *ipu_tensor = toIpuTensorImpl(tensor); ss << " ID " << ipu_tensor->details->tensor_id; } ss << " sizes " << tensor.unsafeGetTensorImpl()->sizes(); ss << " dtype " << tensor.unsafeGetTensorImpl()->dtype(); } return ss.str(); } uint64_t tensorDataSize(const at::Tensor &tensor) { return tensorImplDataSize(*tensor.unsafeGetTensorImpl()); } Buffer &getHostBuffer(const at::Tensor &ipu_tensor) { return getHostBuffer(*toIpuTensorImpl(ipu_tensor)); } Buffer &getHostBuffer(const at::TensorImpl &ipu_tensor) { auto details = toIpuTensorImpl(ipu_tensor)->details; 
return details->getBuffer(); } bool hasData(const at::Tensor &ipu_tensor) { const auto &details = *toIpuTensorImpl(*toIpuTensorImpl(ipu_tensor))->details; return details.hasData(); } void errorOnZeroSizedTensor(const at::Tensor &tensor) { auto sizes = tensor.sizes(); if (std::any_of(sizes.begin(), sizes.end(), [](auto dim) { return dim == 0; })) { std::stringstream err; err << "Zero-sized tensors are unsupported (Got shape ["; for (std::size_t i = 0; i < sizes.size() - 1; i++) { err << sizes[i] << ", "; } err << sizes[sizes.size() - 1] << "])."; ERROR(err.str()); } } TensorStore::TensorStore() : _ipu_session(poptorch_ir::createStaticSession()) {} std::shared_ptr TensorStore::allocateTensorDetails(c10::IntArrayRef size, at::ScalarType coerced_scalar_type, std::shared_ptr view_info) { for (size_t dim = 0; dim < size.size(); ++dim) { ERROR_ON_MSG(size.at(dim) < 0, "Invalid tensor shape: dimension " << dim << " is negative (" << size.at(dim) << ")"); } auto details = std::make_shared( _next_tensor_id++, getTensorType(coerced_scalar_type, size.vec()), std::move(view_info)); return details; } at::Tensor TensorStore::allocateTensor(c10::IntArrayRef size, c10::optional dtype, std::shared_ptr view_info, c10::optional device) { const at::ScalarType scalar_type = scalarTypeOrDefault(dtype); auto coerced_scalar_type = coerceToSupportedType(scalar_type); auto details = allocateTensorDetails(size, coerced_scalar_type, std::move(view_info)); auto strides = at::detail::defaultStrides(size); at::Tensor output = at::detail::make_tensor( c10::scalarTypeToTypeMeta(coerced_scalar_type), deviceOrDefaultIpu(device), size, strides, std::move(details)); for (size_t dim = 0; dim < size.size(); ++dim) { ERROR_ON_MSG(size.at(dim) < 0, "Invalid tensor shape: dimension " << dim << " is negative (" << size.at(dim) << ")"); } ERROR_ON(output.device().type() != c10::DeviceType::IPU); logging::trace( "Created IPU tensor: id {} impl_ {} size {} strides {} dtype {}", ipuTensorId(output), 
reinterpret_cast(output.unsafeGetTensorImpl()), size, strides, coerced_scalar_type); if (scalar_type != coerced_scalar_type) { logging::warn("[DISPATCHER] Type coerced from {} to {} for tensor id {}", scalar_type, coerced_scalar_type, ipuTensorId(output)); } return output; } Buffer &TensorStore::allocateBuffer(IpuTensorDetails &details) { return details.getBuffer() = _ipu_session->allocate(details.type); } void TensorStore::allocateBuffer(const at::Tensor &ipu_tensor) { auto &details = *getTensorDetails(ipu_tensor); allocateBuffer(details); } void TensorStore::copyOnIpu(const at::Tensor &ipu_dest, const at::Tensor &ipu_src) { ERROR_ON_MSG(ipu_dest.dtype() != ipu_src.dtype(), "Copy operations cannot cast outside of the dispatcher."); const auto &src_details = getTensorDetails(ipu_src); const auto &dest_details = getTensorDetails(ipu_dest); auto dest_buf = allocateBuffer(*dest_details); _ipu_session->copyDataOnDevice(dest_buf, src_details->getBuffer()); ipu_dest.set_requires_grad(ipu_src.requires_grad()); } void TensorStore::copyFromCpu(const at::Tensor &ipu_dest, const at::Tensor &cpu_src) { logging::trace("[DISPATCHER] Copying from CPU tensor {} with data_ptr {}", static_cast(cpu_src.unsafeGetTensorImpl()), cpu_src.data_ptr()); ERROR_ON(cpu_src.dtype() != ipu_dest.dtype()); ERROR_ON(cpu_src.sizes() != ipu_dest.sizes()); const auto &details = getTensorDetails(ipu_dest); auto &buff = allocateBuffer(*details); _ipu_session->copyDataFromCpuSource( buff, static_cast(cpu_src.data_ptr())); ipu_dest.set_requires_grad(cpu_src.requires_grad()); } void TensorStore::copyToCpu(const at::Tensor &cpu_dest, const at::Tensor &ipu_src) { logging::trace("[DISPATCHER] Copying to CPU tensor {} with data_ptr {}", static_cast(cpu_dest.unsafeGetTensorImpl()), cpu_dest.data_ptr()); ERROR_ON(ipu_src.dtype() != cpu_dest.dtype()); ERROR_ON(ipu_src.sizes() != cpu_dest.sizes()); const auto &details = getTensorDetails(ipu_src); _ipu_session->copyDataToCpu(static_cast(cpu_dest.data_ptr()), 
details->getBuffer());
}

const std::shared_ptr &
TensorStore::getIpuSession() const {
  return _ipu_session;
}

// Release this store's reference to the session.
void TensorStore::reset() { _ipu_session = nullptr; }

// Return the details attached to the given IPU tensor (raises an error if
// the tensor is not backed by an IpuTensorImpl).
std::shared_ptr getTensorDetails(const at::Tensor &ipu_tensor) {
  return getTensorDetails(*ipu_tensor.unsafeGetTensorImpl());
}

// Return the details of each of the given IPU tensors, in the same order.
std::vector>
getTensorDetails(const std::vector &ipu_tensors) {
  std::vector> details;
  details.reserve(ipu_tensors.size());

  std::transform(
      ipu_tensors.begin(), ipu_tensors.end(), std::back_inserter(details),
      [](const auto &ipu_tensor) { return getTensorDetails(ipu_tensor); });

  return details;
}

// Attach `details` to the given IPU tensor and resize the tensor
// (contiguously) to the shape stored in the details' type.
void setTensorDetails(const at::Tensor &ipu_tensor,
                      std::shared_ptr details) {
  auto *impl = dynamic_cast(ipu_tensor.unsafeGetTensorImpl());
  ERROR_ON(impl == nullptr);
  // The shape must be read before `details` is moved from below.
  impl->set_sizes_contiguous(details->type.shape);
  impl->details = std::move(details);
}

namespace {
// Select the initial content of IpuTensorDetails::data: the view information
// if there is any, a newly created buffer object otherwise.
IpuTensorDetails::Data
getBufferOrView(std::shared_ptr view_info) {
  if (view_info) {
    return view_info;
  }
  return std::make_shared();
}
} // namespace

IpuTensorDetails::IpuTensorDetails(IpuTensorId tensor_id_,
                                   poptorch_ir::TensorType type_,
                                   std::shared_ptr view_info)
    : tensor_id(tensor_id_), type(std::move(type_)),
      data(getBufferOrView(std::move(view_info))) {}

// Return the buffer owned by these details; raises an error if the details
// hold a view instead of a buffer.
Buffer &IpuTensorDetails::getBuffer() {
  return std::visit(
      Overloaded{[](const BufferPtr &buffer) -> Buffer & { return *buffer; },
                 [](const TensorViewPtr &view) -> Buffer & {
                   UNUSED(view);
                   ERROR("Cannot get the buffer of a view tensor.");
                 }},
      data);
}

// Return the shared pointer to the owned buffer, or nullptr for a view.
std::shared_ptr IpuTensorDetails::getOwningBuffer() const {
  return std::visit(
      Overloaded{[](const BufferPtr &buffer) -> BufferPtr { return buffer; },
                 [](const TensorViewPtr &view) -> BufferPtr {
                   UNUSED(view);
                   return nullptr;
                 }},
      data);
}

// A buffer has data when it reports so; a view has data when its view
// pointer is not null.
bool IpuTensorDetails::hasData() const {
  return std::visit(
      Overloaded{[](const BufferPtr &buffer) { return buffer->hasData(); },
                 [](const TensorViewPtr &view) { return view != nullptr; }},
      data);
}

bool IpuTensorDetails::isView() const {
  return std::holds_alternative(data);
}

} // namespace
poptorch ================================================ FILE: poptorch/source/dispatch_tracer/Tensor.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #ifndef POPTORCH_DISPATCH_TENSOR_HPP_ #define POPTORCH_DISPATCH_TENSOR_HPP_ #include #include #include #include #include #include #include #include "pytorch_bridge/CompilerTypes.hpp" #include "pytorch_bridge/DebugInfo.hpp" #include "pytorch_bridge/IpuSession.hpp" namespace poptorch_ir { class IIpuSession; } namespace poptorch { using Buffer = poptorch_ir::Buffer; using IpuTensorId = uint64_t; class IDispatch; struct IpuTensorImpl; class ValueMapper; class ITensorView { public: virtual poptorch_ir::TensorId addViewToGraph(IDispatch &dispatcher) = 0; }; // The ipu tensor details tracks the data and meta information associated with // the IpuTensorImpl. This information cannot be directly stored in the ipu // tensor impl because the lifetime of that is too short when views are // involved. We need to the lifetime of the data to outlive any views of the // data. struct IpuTensorDetails { IpuTensorDetails(IpuTensorId tensor_id_, poptorch_ir::TensorType type_, std::shared_ptr view_info); // The tensor details either owns its own storage or is a view of other tensor // details. // // For inputs that are temporaries we need the buffer to live until the // function is ran and we don't want to extend the lifetime of the // IpuTensorDetails unnecessarily. This means we need to share ownership of // the buffer. 
using Data = std::variant, std::shared_ptr>; const IpuTensorId tensor_id; const poptorch_ir::TensorType type; Data data; poptorch_ir::TensorDebugInfo debug_info; Buffer &getBuffer(); std::shared_ptr getOwningBuffer() const; bool hasData() const; bool isView() const; }; poptorch_ir::Type toCompilerType(const at::ScalarType &elem_type); poptorch_ir::Type toCompilerElementType(const at::Tensor &tensor); poptorch_ir::TensorType getTensorType(const at::Tensor &tensor); uint64_t tensorImplDataSize(const at::TensorImpl &impl); // Return the data size in bytes of the given at::Tensor. uint64_t tensorDataSize(const at::Tensor &tensor); // Return the tensor ID of the given IPU tensor. IpuTensorId ipuTensorId(const at::Tensor &tensor); // Return the tensor ID of the given IPU tensor implementation. IpuTensorId ipuTensorId(const at::TensorImpl &tensor); // Return true if the given at::Tensor is an IPU tensor. bool isIpuTensor(const at::Tensor &tensor); // Return a string containing the given tensor's metadata (device, shape, etc). std::string str(const at::Tensor &tensor); // Returns a reference to the CPU buffer of the given IPU tensor. Buffer &getHostBuffer(const at::Tensor &ipu_tensor); // Returns a reference to the CPU buffer of the given IPU tensor implementation. Buffer &getHostBuffer(const at::TensorImpl &ipu_tensor); bool hasData(const at::Tensor &ipu_tensor); std::shared_ptr getTensorDetails(const at::Tensor &ipu_tensor); std::vector> getTensorDetails(const std::vector &ipu_tensors); void setTensorDetails(const at::Tensor &ipu_tensor, std::shared_ptr details); void errorOnZeroSizedTensor(const at::Tensor &tensor); /** Host-side storage for `ipu` tensors. * * This allows the user to convert tensors and modules to `ipu` using * `t.to("ipu")` even when the dispatcher is off, and even outside eager mode. * * We simply copy the tensor in to our ownership, then when we go to load and * execute an executable, we can upload these tensors to the device. 
We'll * also retrieve them from the device when the user copies a tensor back to the * CPU (`t.to("cpu")`). */ class TensorStore { public: TensorStore(); TensorStore(const TensorStore &) = delete; TensorStore(TensorStore &&) = delete; TensorStore &operator=(TensorStore &) = delete; TensorStore &operator=(TensorStore &&) = delete; std::shared_ptr allocateTensorDetails(c10::IntArrayRef size, at::ScalarType coerced_scalar_type, std::shared_ptr view_info); // Create a new IPU tensor. at::Tensor allocateTensor(c10::IntArrayRef sizes, c10::optional dtype = c10::nullopt, std::shared_ptr view_info = nullptr, c10::optional device = c10::nullopt); void allocateBuffer(const at::Tensor &ipu_tensor); void copyOnIpu(const at::Tensor &ipu_dest, const at::Tensor &ipu_src); void copyFromCpu(const at::Tensor &ipu_dest, const at::Tensor &cpu_src); void copyToCpu(const at::Tensor &cpu_dest, const at::Tensor &ipu_src); const std::shared_ptr &getIpuSession() const; void reset(); private: Buffer &allocateBuffer(IpuTensorDetails &details); poptorch_ir::TensorId _next_tensor_id{1}; std::shared_ptr _ipu_session = poptorch_ir::createStaticSession(); }; } // namespace poptorch #endif // POPTORCH_DISPATCH_TENSOR_HPP_ ================================================ FILE: poptorch/source/dispatch_tracer/TypeInferenceHandler.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#include #include #include "poptorch_logging/Logging.hpp" #include "CommonHelperFunctions.hpp" #include "TypeInferenceHandler.hpp" #include namespace poptorch { constexpr c10::DispatchKeySet meta_keys{c10::DispatchKey::Meta, c10::DispatchKey::AutogradMeta}; namespace { c10::Stack copyTensorsFrom(const c10::Stack &meta_stack) { c10::Stack tmp_stack; tmp_stack.reserve(meta_stack.size()); std::copy_if(meta_stack.cbegin(), meta_stack.cend(), std::back_inserter(tmp_stack), [](const auto &value) { return value.isTensor() || value.isTensorList(); }); return tmp_stack; } } // namespace void TypeInferenceHandler::inferOutputTypes(const c10::OperatorHandle &op, c10::Stack *ipu_stack) { const auto schema_key = getSchemaKey(op.schema()); ERROR_ON_MSG(!op.hasComputedKernelForDispatchKey(c10::DispatchKey::Meta), "Type inference failed for " << schema_key << " because the operator " "doesn't have an implementation for the Meta backend."); // Unfortunately, aten::prelu with 1D inputs is broken with the Meta backend: // https://github.com/pytorch/pytorch/issues/89560 // As a workaround, we add a dummy channel dim to the input, and then remove // it again afterwards const bool is_prelu = schema_key == "aten::prelu" || schema_key == "aten::_prelu_kernel"; // Create a new operand stack with meta tensors c10::Stack meta_stack = createMetaStack(*ipu_stack, schema_key, is_prelu); // redispatchBoxed drops all function inputs from the stack. Meta stack is // the only owner of input params created by createMetaStack. If function // returns reference to input param, dropping params cause memory leak. // In order to prevent it lifetime of inputs must be extedned. 
const c10::Stack input_tensor_liftetime_extender = copyTensorsFrom(meta_stack); logging::trace("[DISPATCHER] Using meta type inference for {}", schema_key); op.redispatchBoxed(meta_keys, &meta_stack); ipu_stack->clear(); repopulateIpuStack(*ipu_stack, meta_stack, is_prelu); } std::optional TypeInferenceHandler::workaroundLookup(const std::string &schema_key) { if (const auto &it = schema_to_workaround.find(schema_key); it != schema_to_workaround.cend()) { return it->second; } return std::nullopt; } c10::IValue TypeInferenceHandler::applyWorkaround( const TypeInferenceHandler::Workaround &workaround, std::size_t value_index, const c10::IValue &value, const c10::Stack &stack) { if (workaround.predicate_fn(value_index, value, stack)) { return workaround.transform_fn(value, stack); } return value; } namespace { template c10::List createMetaTensorList(const c10::List &ipu_tensor_list, bool should_upcast_to_long) { c10::List meta_tensor_list; std::function transform_fn; if constexpr (std::is_same_v, T>) { transform_fn = [=](const T &t) -> T { if (!t.has_value()) { return c10::nullopt; } return TypeInferenceHandler::toMeta(t.value(), should_upcast_to_long); }; } else { transform_fn = [=](const T &t) -> T { return TypeInferenceHandler::toMeta(t, should_upcast_to_long); }; } std::transform(ipu_tensor_list.begin(), ipu_tensor_list.end(), std::back_inserter(meta_tensor_list), transform_fn); return meta_tensor_list; } c10::Device createMetaDevice(const c10::Device &device) { return device.is_ipu() ? 
c10::Device{at::kMeta} : device; } bool isUpcastRequired(const std::string &schema_key, const std::size_t input_idx) { if (auto opt_upcast_arg = TypeInferenceHandler::indexArgToUpcast(schema_key)) { return opt_upcast_arg.value() == input_idx; } return false; } } // namespace c10::Stack TypeInferenceHandler::createMetaStack(const c10::Stack &ipu_stack, const std::string &schema_key, bool is_prelu) { c10::Stack meta_stack; meta_stack.reserve(ipu_stack.size()); const auto maybe_workaround = workaroundLookup(schema_key); std::transform( ipu_stack.cbegin(), ipu_stack.cend(), std::back_inserter(meta_stack), [&, input_idx = 0u](const c10::IValue &value) mutable -> c10::IValue { // For various reasons, sometimes we have to transform the value before // pushing it on the meta stack to workaround validation issues which // are not the problem for the PopArt backend. const auto &v = maybe_workaround ? applyWorkaround(maybe_workaround.value(), input_idx, value, ipu_stack) : value; // We coerce index tensor types from Long to Int during dispatch, but // these need to be converted back to Long before running with the Meta // backend otherwise they'll emit type errors const bool should_upcast_to_long = isUpcastRequired(schema_key, input_idx); const bool is_first_input = input_idx == 0; ++input_idx; // Convert any IPU tensors to meta tensors if (v.isTensor()) { return toMeta(v.toTensor(), should_upcast_to_long, is_prelu && is_first_input); } if (v.isTensorList()) { return createMetaTensorList(v.toTensorList(), should_upcast_to_long); } if (v.isOptionalTensorList()) { return createMetaTensorList(v.toOptionalTensorList(), should_upcast_to_long); } if (v.isDevice()) { return createMetaDevice(v.toDevice()); } return v; }); return meta_stack; } at::Tensor TypeInferenceHandler::allocateTensor(const at::Tensor &meta_tensor, bool is_prelu) { auto sizes = meta_tensor.sizes(); if (is_prelu && sizes.size() == 2 && sizes[1] == 1) { sizes = sizes.slice(1); } return 
_tensor_store->allocateTensor(sizes, meta_tensor.scalar_type()); } c10::List TypeInferenceHandler::allocateTensorList( const c10::List &meta_tensor_list) { c10::List allocated_tensor_list; std::transform(meta_tensor_list.begin(), meta_tensor_list.end(), std::back_inserter(allocated_tensor_list), [this](const at::Tensor &tensor) { return this->_tensor_store->allocateTensor( tensor.sizes(), tensor.scalar_type()); }); return allocated_tensor_list; } void TypeInferenceHandler::repopulateIpuStack(c10::Stack &ipu_stack, const c10::Stack &meta_stack, bool is_prelu) { ERROR_ON(!ipu_stack.empty()); ipu_stack.reserve(meta_stack.size()); std::transform(meta_stack.cbegin(), meta_stack.cend(), std::back_inserter(ipu_stack), [=](const auto &v) -> c10::IValue { if (v.isTensor()) { return allocateTensor(v.toTensor(), is_prelu); } if (v.isTensorList()) { return allocateTensorList(v.toTensorList()); } return v; }); } namespace { std::vector getMetaTensorSize(const at::Tensor &tensor, bool is_prelu) { std::vector sizes = tensor.sizes().vec(); if (is_prelu && sizes.size() == 1) { sizes.push_back(1); } return sizes; } c10::ScalarType getMetaTensorDtype(const at::Tensor &tensor, bool should_upcast_to_long) { const auto dtype = tensor.scalar_type(); if (dtype == c10::ScalarType::Int && should_upcast_to_long) { return c10::ScalarType::Long; } return dtype; } at::Tensor createEmptyMetaTensor(const at::Tensor &tensor, bool should_upcast_to_long, bool is_prelu) { const auto dtype = getMetaTensorDtype(tensor, should_upcast_to_long); const std::vector sizes = getMetaTensorSize(tensor, is_prelu); auto out = at::meta::empty(sizes, dtype); if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) { out.unsafeGetTensorImpl()->set_wrapped_number(true); } return out; } } // namespace at::Tensor TypeInferenceHandler::toMeta(const at::Tensor &tensor, bool should_upcast_to_long, bool is_prelu) { if (!tensor.defined()) { return tensor; } if (!isIpuTensor(tensor)) { if 
(tensor.unsafeGetTensorImpl()->is_wrapped_number()) { return tensor; } ERROR("Expected an IPU tensor but got tensor(device=" << tensor.device() << ", shape=" << tensor.sizes() << ", dtype=" << tensor.scalar_type() << ").\nConstant tensors should be moved explicitly " "to the IPU, via cpu_tensor.to(\"ipu\")."); } return createEmptyMetaTensor(tensor, should_upcast_to_long, is_prelu); } c10::optional TypeInferenceHandler::indexArgToUpcast(const std::string &schema_key) { if (schema_key == "aten::argmax.out" || schema_key == "aten::argmin.out") { return 3; } if (schema_key == "aten::gather" || schema_key == "aten::scatter.src" || schema_key == "aten::scatter_.src" || schema_key == "aten::scatter.value" || schema_key == "aten::scatter.value_reduce" || schema_key == "aten::scatter_.value" || schema_key == "aten::scatter_.value_reduce" || schema_key == "aten::scatter_add" || schema_key == "aten::scatter_add_" || schema_key == "aten::scatter_reduce.two" || schema_key == "aten::scatter_reduce_.two" || schema_key == "torch_scatter::scatter_max" || schema_key == "torch_scatter::scatter_min" || schema_key == "torch_scatter::scatter_mul" || schema_key == "torch_spline_conv::spline_basis") { return 2; } if (schema_key == "aten::index.Tensor" || schema_key == "aten::nll_loss_forward" || schema_key == "aten::take_along_dim" || schema_key == "aten::take_along_dim.out") { return 1; } if (schema_key == "aten::sort.values_stable") { return 5; } return c10::nullopt; } static bool reductionWorkaroundPredicate(const std::size_t value_index, const c10::IValue &value, const c10::Stack &ipu_stack, const std::size_t dtype_index, const std::size_t out_index) { return value_index == dtype_index && value.isNone() && !ipu_stack.at(out_index).isNone(); } static c10::IValue reductionTransform(const c10::IValue &transformed_value, const c10::Stack &ipu_stack, const std::size_t out_index) { const auto &value = ipu_stack.at(out_index); if (!value.isNone() && value.isTensor()) { const auto tensor = 
value.toTensor(); return c10::IValue(c10::typeMetaToScalarType(tensor.dtype())); } return transformed_value; } static auto makeReductionWorkaround(const std::size_t dtype_index, const std::size_t out_index) { /* In case dtype is None, PyTorch meta backend assumes that it is int64_t for * all integral tensors, causing validation issues when the output tensor has * int32_t dtype. */ const auto predicate = [=](const std::size_t value_index, const c10::IValue &value, const c10::Stack &ipu_stack) { return reductionWorkaroundPredicate(value_index, value, ipu_stack, dtype_index, out_index); }; const auto transform_fn = [=](const c10::IValue &transformed_value, const c10::Stack &ipu_stack) { return reductionTransform(transformed_value, ipu_stack, out_index); }; return TypeInferenceHandler::Workaround{predicate, transform_fn}; } const std::unordered_map TypeInferenceHandler::schema_to_workaround = { {"aten::sum.IntList_out", makeReductionWorkaround(3 /*dtype_index*/, 4 /*out_index*/)}, {"aten::cumsum.out", makeReductionWorkaround(2 /*dtype_index*/, 3 /*out_index*/)}, {"aten::cumprod.out", makeReductionWorkaround(2 /*dtype_index*/, 3 /*out_index*/)}, {"aten::sum.out", makeReductionWorkaround(4 /*dtype_index*/, 0 /*out_index*/)}, {"aten::prod.out", makeReductionWorkaround(4 /*dtype_index*/, 0 /*out_index*/)}}; } // namespace poptorch ================================================ FILE: poptorch/source/dispatch_tracer/TypeInferenceHandler.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#ifndef POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_
#define POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_

#include
#include
#include
#include
#include

#include "Tensor.hpp"

namespace poptorch {

// Infers operator output types by running the operator on PyTorch's Meta
// backend, then allocates matching IPU tensors through the tensor store.
class TypeInferenceHandler {
public:
  // tensor_store is not owned; it is used to allocate the output tensors.
  explicit TypeInferenceHandler(TensorStore *tensor_store)
      : _tensor_store(tensor_store) {}

  // Replaces the inputs held in ipu_stack with the operator's outputs, typed
  // according to the Meta backend.
  void inferOutputTypes(const c10::OperatorHandle &op, c10::Stack *ipu_stack);

  // A per-schema fix-up applied to operands before they are passed to the
  // Meta backend: transform_fn is applied to the stack entries for which
  // predicate_fn returns true.
  struct Workaround {
    std::function predicate_fn;
    std::function transform_fn;
  };

  // Create a meta tensor with the same type as the input
  static at::Tensor toMeta(const at::Tensor &tensor, bool upcast_to_long,
                           bool is_prelu = false);

  // Index of the operand to upcast from Int to Long for the given schema, if
  // there is one.
  static c10::optional indexArgToUpcast(const std::string &schema_key);

private:
  // Create a stack of meta tensors that matches the inputs in
  // ipu_stack
  static c10::Stack createMetaStack(const c10::Stack &ipu_stack,
                                    const std::string &schema_key,
                                    bool is_prelu);

  // Using the computed meta output stack, repopulate the ipu stack
  // with tensors of the correct inferred output types
  void repopulateIpuStack(c10::Stack &ipu_stack, const c10::Stack &meta_stack,
                          bool is_prelu);

  // Allocate IPU tensor(s) matching the given meta tensor(s).
  at::Tensor allocateTensor(const at::Tensor &meta_tensor, bool is_prelu);
  c10::List allocateTensorList(const c10::List &meta_tensor_list);

  // Look up the workaround registered for a schema; empty if there is none.
  static std::optional workaroundLookup(const std::string &schema_key);
  // Apply the workaround to one stack entry if its predicate matches.
  static c10::IValue applyWorkaround(const Workaround &workaround,
                                     std::size_t value_index,
                                     const c10::IValue &value,
                                     const c10::Stack &stack);

  // Schema key -> workaround table.
  static const std::unordered_map schema_to_workaround;

  TensorStore *_tensor_store;
};

} // namespace poptorch

#endif // POPTORCH_DISPATCH_TYPE_INFERENCE_HANDLER_HPP_

================================================
FILE: poptorch/source/dispatch_tracer/ValueMapper.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#include "ValueMapper.hpp"

#include
#include
#include

#include "Tensor.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include "pytorch_bridge/IpuSession.hpp"

namespace poptorch {

ValueMapper::ValueMapper(ValueMapper &&other) noexcept = default;
ValueMapper &ValueMapper::operator=(ValueMapper &&other) noexcept = default;
ValueMapper::~ValueMapper() = default;

// Records the details without owning them and takes shared ownership of the
// buffer returned by getOwningBuffer().
ValueMapper::TrackedTensor::TrackedTensor(
    const std::shared_ptr &details)
    : tensor_details(details), buffer(details->getOwningBuffer()) {}

// Returns true if t is tracked and flagged as a parameter; untracked tensors
// are reported as non-parameters.
bool ValueMapper::isParameter(const at::Tensor &t) const {
  if (const auto *record = find(t)) {
    return record->is_parameter;
  }
  return false;
}

// Associates name with the tracked parameter t.
//
// Only warns (and does nothing) when t is not tracked, or when t is a non
// floating point tensor that is not flagged as a parameter. Errors if t is
// otherwise not a parameter, or if the name or the tensor id is already bound
// to a different counterpart. Re-registering an identical pair is a no-op.
void ValueMapper::setParameterName(const at::Tensor &t,
                                   const std::string &name) {
  const IpuTensorId id = ipuTensorId(t);
  const auto itr = _tensors.find(id);
  if (itr == _tensors.end()) {
    logging::warn("Parameter {} cannot be named because it was not added to "
                  "the value mapper.",
                  name);
    return;
  }
  if (!itr->second.is_parameter && !t.is_floating_point()) {
    logging::warn("Parameter {}: {} was downgraded to constant because PopART "
                  "doesn't support non floating point parameters",
                  name, str(t));
    return;
  }

  ERROR_ON_MSG(!itr->second.is_parameter,
               "Not a parameter or a buffer: " << str(t));

  auto name_it = _name_ids_map.find(name);
  if (name_it != _name_ids_map.end()) {
    ERROR_ON_MSG(name_it->second != id,
                 "Name " << name << " can't be associated to " << id
                         << " because it is already associated to "
                         << name_it->second);
    return;
  }

  auto id_it = _ids_name_map.find(id);
  if (id_it != _ids_name_map.end()) {
    ERROR_ON_MSG(id_it->second != name, "Name for tensor id "
                                            << id << " can't be set to " << name
                                            << " because it is already set to "
                                            << id_it->second);
    return;
  }

  // Keep both directions of the mapping in sync.
  _name_ids_map.insert({name, id});
  _ids_name_map.insert({id, name});
}

// Returns the name registered for the tensor behind value, or an empty string
// when the value is not tracked or has no name.
std::string ValueMapper::getParameterName(torch::jit::Value *value) const {
  auto itr = _values_map.find(value);
  if (itr == _values_map.end()) {
    logging::trace("JIT value not tracked {}", reinterpret_cast(value));
    return "";
  }

  auto it = _ids_name_map.find(itr->second);
  if (it == _ids_name_map.end()) {
    return "";
  }
  return it->second;
}

// Stores per-replica settings for the named parameter, together with a
// private copy of tensor's data.
//
// Only warns when param_name is unknown. Errors if tensor is not contiguous,
// since its data is copied with a single memcpy.
void ValueMapper::setParameterPerReplica(const std::string &param_name,
                                         const at::Tensor &tensor,
                                         int comm_group_type, int shards,
                                         int variable_retrieval_mode) {
  auto param_it = _name_ids_map.find(param_name);
  if (param_it == std::end(_name_ids_map)) {
    logging::warn("Parameter name {} was not found", param_name);
    return;
  }

  auto data_size = tensorDataSize(tensor);
  ERROR_ON_MSG(!tensor.is_contiguous(),
               "Data source must be contiguous: " << str(tensor));

  const PerReplicaSettings settings = {
      comm_group_type, shards, variable_retrieval_mode, tensor.size(0),
      std::make_shared>(data_size)};
  memcpy(settings.host_buffer->data(), tensor.data_ptr(), data_size);
  _per_replica_map[param_it->second] = settings;
}

// Returns the per-replica settings of the tensor behind value, or nullopt when
// the value is not tracked or has no such settings.
std::optional
ValueMapper::getParameterPerReplica(torch::jit::Value *value) const {
  auto itr = _values_map.find(value);
  if (itr == _values_map.end()) {
    logging::trace("JIT value not tracked {}", reinterpret_cast(value));
    return std::nullopt;
  }

  auto it = _per_replica_map.find(itr->second);
  if (it == _per_replica_map.end()) {
    return std::nullopt;
  }
  return it->second;
}

// Add a tensor to the IR.
void ValueMapper::addTensor(const std::shared_ptr &details, poptorch_ir::TensorId mlir_id, bool is_param) { logging::trace("Adding {} to value mapper {}, MLIR id: {}", details->tensor_id, static_cast(this), mlir_id); auto tensor_id = details->tensor_id; auto &record = _tensors.insert({tensor_id, TrackedTensor{details}}).first->second; record.mlir = mlir_id; record.is_parameter |= is_param; _mlir_id_tensors_map.emplace(mlir_id, tensor_id); } void ValueMapper::addTensor(const at::Tensor &t, poptorch_ir::TensorId mlir_id, bool is_param) { addTensor(getTensorDetails(t), mlir_id, is_param); } void ValueMapper::addTensorUnchecked(const at::Tensor &t, torch::jit::Value *val, bool is_param) { logging::trace("Adding {} to value mapper {}, JIT ir: {}", static_cast(t.unsafeGetTensorImpl()), static_cast(this), val->debugName()); // If the tensor is already being tracked then we will update the JIT // value being tracked. Otherwise we insert and add the jit value. const auto &new_details = getTensorDetails(t); const auto ipu_tensor_id = new_details->tensor_id; auto &record = _tensors.insert({ipu_tensor_id, TrackedTensor{new_details}}) .first->second; record.jit = val; record.is_parameter |= is_param; // Ensure we maintain a lookup of torch::jit to pytorch tensor. 
_values_map.insert({val, ipu_tensor_id}); } void ValueMapper::addTensor(const at::Tensor &t, torch::jit::Value *val, bool is_param) { ERROR_ON_MSG(val == nullptr, "torch::jit::Value* cannot be null"); validateTensorShapeAndType(val, t); addTensorUnchecked(t, val, is_param); } ValueMapper::TrackedTensor *ValueMapper::rawTensorRecord(const at::Tensor &t) { return find(t); } ValueMapper::TrackedTensor * ValueMapper::rawTensorRecord(torch::jit::Value *val) { auto itr = _values_map.find(val); if (itr == _values_map.end()) { return nullptr; } auto tracked_tensor_itr = _tensors.find(itr->second); if (tracked_tensor_itr == _tensors.end()) { return nullptr; } return &tracked_tensor_itr->second; } // Get the user tensor from our SSA tensors. torch::jit::Value *ValueMapper::getValueForTensor(const at::Tensor &t) { if (!isIpuTensor(t)) { return nullptr; } if (auto *tracked_tensor = find(t)) { return tracked_tensor->jit; } return nullptr; } poptorch_ir::TensorId ValueMapper::getMLIRForTensorId(IpuTensorId tensor_id) const { if (const auto itr = _tensors.find(tensor_id); itr != _tensors.end()) { return itr->second.mlir; } return poptorch_ir::tensor_error_id; } poptorch_ir::TensorId ValueMapper::getMLIRForTensor(const IpuTensorDetails &details) const { if (const auto *tracked_tensor = find(details)) { return tracked_tensor->mlir; } return poptorch_ir::tensor_error_id; } poptorch_ir::TensorId ValueMapper::getMLIRForTensor(const at::Tensor &t) const { if (!isIpuTensor(t)) { return poptorch_ir::tensor_error_id; } return getMLIRForTensor(*getTensorDetails(t)); } bool ValueMapper::hasMapping(const at::Tensor &t) const { return find(t) != nullptr; } void ValueMapper::addTensorList(const TensorList &list, torch::jit::Value *val) { logging::trace("Adding tensor list to value mapper, JIT ir: {}", val->debugName()); _tensor_lists.insert({list, val}); } torch::jit::Value *ValueMapper::getValueForTensorList(const TensorList &list) { auto itr = _tensor_lists.find(list); if (itr != 
_tensor_lists.end()) { return itr->second; } return nullptr; } void ValueMapper::replaceValue(torch::jit::Value *v_old, torch::jit::Value *v_new) { for (auto &rec : _tensors) { if (rec.second.jit == v_old) { rec.second.jit = v_new; } } } std::shared_ptr ValueMapper::getTensorDetailsForId(IpuTensorId id) const { auto it = _tensors.find(id); if (it == _tensors.end()) { return nullptr; } return it->second.tensor_details.lock(); } std::shared_ptr ValueMapper::getTensorDetailsForMlirId(poptorch_ir::TensorId id) const { auto it = _mlir_id_tensors_map.find(id); if (it == _mlir_id_tensors_map.end()) { return nullptr; } return getTensorDetailsForId(it->second); } Buffer ValueMapper::getBufferForId(IpuTensorId id) const { const auto it = _tensors.find(id); if (it == _tensors.end()) { return Buffer(); } return *it->second.buffer; } poptorch_ir::CpuBuffer ValueMapper::getBufferForValue(torch::jit::Value *value) const { auto itr = _values_map.find(value); if (itr == _values_map.end()) { return nullptr; } if (auto b = getBufferForId(itr->second); b.hasData()) { return b.getCpuData(); } return nullptr; } ValueMapper::TrackedTensor *ValueMapper::find(const IpuTensorDetails &details) { auto itr = _tensors.find(details.tensor_id); if (itr == _tensors.end()) { return nullptr; } return &itr->second; } const ValueMapper::TrackedTensor * ValueMapper::find(const IpuTensorDetails &details) const { return const_cast(this)->find(details); } ValueMapper::TrackedTensor *ValueMapper::find(const at::Tensor &t) { return find(*getTensorDetails(t)); } const ValueMapper::TrackedTensor *ValueMapper::find(const at::Tensor &t) const { return find(*getTensorDetails(t)); } } // namespace poptorch ================================================ FILE: poptorch/source/dispatch_tracer/ValueMapper.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#ifndef POPTORCH_DISPATCH_VALUE_MAPPER_HPP_ #define POPTORCH_DISPATCH_VALUE_MAPPER_HPP_ #include #include #include #include #include #include #include #include "Tensor.hpp" #include "poptorch/DispatchTracer.hpp" #include "pytorch_bridge/CompilerTypes.hpp" namespace poptorch { /* * The value mapper is the core of the tracer functionality. It provides the * system by which we map an incoming at::Tensor onto the compiler IRs. We take * a tensor and disambiguate it into a torch::jit::Value or poptorch compiler * TensorID corresponding to the values we are tracking for that tensor in the * JIT/MLIR graphs respectively. */ class ValueMapper { private: using TensorList = std::vector; // Hash combine for mapping a vector of jit values (inputs of a // prim::ListConstruct) to the output jit value. This allows us to use an // unordered_map from TensorList to the output values and thus track the // incoming tensor lists. Performance and collisions are not very critical // in this scenario as we don't expect models with unreasonably // large number of lists. struct TensorListHash { size_t operator()(const TensorList &list) const { const std::hash hash_func; size_t hash = 11; for (const auto *value : list) { const size_t hash_next = hash_func(value); hash = hash * 31 + hash_next; } return hash; } }; public: ValueMapper() = default; ValueMapper(ValueMapper &&) noexcept; ValueMapper &operator=(ValueMapper &&) noexcept; ValueMapper(const ValueMapper &) = delete; ValueMapper &operator=(const ValueMapper &) = delete; ~ValueMapper(); // Each tensor we are tracking has a short record containing a pointer to the // tensor and its corresponding values in the two IRs. struct TrackedTensor { explicit TrackedTensor(const std::shared_ptr &details); // The underlying tensor information. Note that we don't participate in // ownership here. We want to tie the lifetime of the tensor details to the // when the tensor is accessible from pytorch. 
Note that it isn't sufficient // to check whether the tensor is directly accessible from pytorch because // the tensor details might be kept alive at the end of a chain of view // tensors std::weak_ptr tensor_details; bool is_parameter = false; // We want to track the lifetime of the tensor_details and the buffer // separately. This is so we can get the data from inputs to the graph that // are temporaries without extending their lifetime std::shared_ptr buffer; // The value in JIT IR torch::jit::Value *jit = nullptr; // The value in our mlir backend. poptorch_ir::TensorId mlir = poptorch_ir::tensor_error_id; }; TrackedTensor *rawTensorRecord(const at::Tensor &t); TrackedTensor *rawTensorRecord(torch::jit::Value *val); torch::jit::Value *getValueForTensor(const at::Tensor &t); poptorch_ir::TensorId getMLIRForTensorId(IpuTensorId tensor_id) const; poptorch_ir::TensorId getMLIRForTensor(const IpuTensorDetails &details) const; poptorch_ir::TensorId getMLIRForTensor(const at::Tensor &t) const; void addTensorUnchecked(const at::Tensor &t, torch::jit::Value *val, bool is_param); void addTensor(const at::Tensor &t, torch::jit::Value *val, bool is_param); void addTensor(const std::shared_ptr &details, poptorch_ir::TensorId mlir_id, bool is_param); void addTensor(const at::Tensor &t, poptorch_ir::TensorId mlir_id, bool is_param); void addTensorList(const TensorList &list, torch::jit::Value *val); torch::jit::Value *getValueForTensorList(const TensorList &list); bool isParameter(const at::Tensor &t) const; void setParameterName(const at::Tensor &t, const std::string &name); std::string getParameterName(torch::jit::Value *value) const; void setParameterPerReplica(const std::string ¶m_name, const at::Tensor &tensor, int comm_group_type, int shards, int variable_retrieval_mode); std::optional getParameterPerReplica(torch::jit::Value *value) const; void replaceValue(torch::jit::Value *v_old, torch::jit::Value *v_new); std::shared_ptr getTensorDetailsForId(IpuTensorId id) const; 
std::shared_ptr getTensorDetailsForMlirId(poptorch_ir::TensorId mlir_id) const; Buffer getBufferForId(IpuTensorId id) const; poptorch_ir::CpuBuffer getBufferForValue(torch::jit::Value *value) const; bool hasMapping(const at::Tensor &t) const; private: // We map each PyTorch tensor to a record of all the metadata we are tracking // about that tensor in the tensor map. std::unordered_map _tensors; // Mapping between parameter / buffer names and tensor IDs std::unordered_map _name_ids_map; std::unordered_map _ids_name_map; std::unordered_map _per_replica_map; // We also need to map the values to the mlir so we can query the mlir for a // given value. std::unordered_map _values_map; // Map each prim::ListConstruct to a corresponding jit output value. std::unordered_map _tensor_lists; // For resolving aliases, it's useful to find a TrackedTensor from its id. std::unordered_map _mlir_id_tensors_map; TrackedTensor *find(const IpuTensorDetails &details); const TrackedTensor *find(const IpuTensorDetails &details) const; TrackedTensor *find(const at::Tensor &t); const TrackedTensor *find(const at::Tensor &t) const; }; } // namespace poptorch #endif // POPTORCH_DISPATCH_VALUE_MAPPER_HPP_ ================================================ FILE: poptorch/source/dispatch_tracer/dispatchers/IDispatch.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#include "IDispatch.hpp"

#include

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../CommonHelperFunctions.hpp"
#include "../Tensor.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

namespace poptorch {

// tensor_store must not be null; it is not owned by the dispatcher.
IDispatch::IDispatch(TensorStore *tensor_store) {
  ERROR_ON(tensor_store == nullptr);
  _tensor_store = tensor_store;
}

// Derives a source range from the Python stack and makes it the current code
// location.
void IDispatch::setPythonStack(
    const std::vector &stack) {
  setCurrentCodeLocation(getPythonInterpreterSourceRange(stack));
}

// Returns a pointer to the host data backing value, or nullptr when no buffer
// is available for it.
void *IDispatch::getDataSource(torch::jit::Value *value) {
  auto buf = _mapper.getBufferForValue(value);
  if (buf == nullptr) {
    logging::trace("JIT value not tracked {}", reinterpret_cast(value));
    return nullptr;
  }
  return buf->data();
}

bool IDispatch::isParameter(const at::Tensor &t) const {
  return _mapper.isParameter(t);
}

// Unlike the at::Tensor overload, this errors when value is not tracked.
bool IDispatch::isParameter(torch::jit::Value *value) {
  auto *record = _mapper.rawTensorRecord(value);
  ERROR_ON_MSG(record == nullptr,
               "JIT value not tracked " << reinterpret_cast(value));
  return record->is_parameter;
}

// The functions below forward to the value mapper.
void IDispatch::setParameterName(const at::Tensor &tensor,
                                 const std::string &name) {
  _mapper.setParameterName(tensor, name);
}

std::string IDispatch::getParameterName(torch::jit::Value *value) const {
  return _mapper.getParameterName(value);
}

void IDispatch::setParameterPerReplica(const std::string &param_name,
                                       const at::Tensor &tensor,
                                       int comm_group_type, int shards,
                                       int variable_retrieval_mode) {
  _mapper.setParameterPerReplica(param_name, tensor, comm_group_type, shards,
                                 variable_retrieval_mode);
}

// Writes the per-replica settings of value into settings and returns true, or
// returns false (leaving settings untouched) when there are none.
bool IDispatch::getParameterPerReplica(torch::jit::Value *value,
                                       PerReplicaSettings &settings) const {
  auto res = _mapper.getParameterPerReplica(value);
  if (!res.has_value()) {
    return false;
  }
  settings = std::move(*res);
  return true;
}

void IDispatch::replaceValue(torch::jit::Value *v_old,
                             torch::jit::Value *v_new) {
  _mapper.replaceValue(v_old, v_new);
}

// adapted from torch/csrc/jit/python/python_tracer.cpp because the header file
// had too many dependencies
//
// Builds a SourceRange whose text is the formatted stack trace and whose
// file/line point at the first stack entry whose filename does not contain
// any of the excluded substrings.
torch::jit::SourceRange IDispatch::getPythonInterpreterSourceRange(
    const std::vector &cs) const {
  // The getter returns a const reference: bind to it instead of copying the
  // whole exclude list on every call.
  const auto &excludes = getSourceLocationExcludes();
  const auto is_filename_excluded = [&](std::string_view filename) {
    // Take each exclude by reference; it is only read.
    const auto excludes_filename = [&filename](const auto &exclude) {
      return filename.find(std::string_view(exclude.data(), exclude.size())) !=
             std::string_view::npos;
    };
    return std::any_of(excludes.begin(), excludes.end(), excludes_filename);
  };

  // transform_reduce
  // One "file(line): function" line per stack entry that has a location.
  auto stack_trace = std::accumulate(
      cs.begin(), cs.end(), std::string(),
      [](std::string trace, const torch::jit::StackEntry &entry) {
        auto file_line_col = entry.range.file_line_col();
        if (file_line_col) {
          const auto &[file, line, col] = *file_line_col;
          UNUSED(col);
          trace += file + "(" + std::to_string(line) + "): " + entry.filename +
                   "\n";
        }
        return trace;
      });

  // First entry with a location that is not excluded.
  auto val = std::find_if(
      cs.begin(), cs.end(),
      [is_filename_excluded](const torch::jit::StackEntry &entry) {
        auto file_line_col = entry.range.file_line_col();
        if (file_line_col) {
          return !is_filename_excluded(std::get<0>(*file_line_col));
        }
        return false;
      });

  // When nothing matches, the filename stays empty and the line stays 0.
  c10::optional source_filename;
  std::size_t source_line = 0;
  if (val != cs.end()) {
    std::size_t col = 0;
    std::tie(source_filename, source_line, col) = *val->range.file_line_col();
  }

  auto source = std::make_shared(
      stack_trace, source_filename, source_line);
  logging::trace("Setting op source to: {}:{}", source_filename.value_or(""),
                 source_line);
  return torch::jit::SourceRange(source, 0, stack_trace.size());
}

IDispatch::~IDispatch() { resetCurrentSourceLocation(); }

} // namespace poptorch

================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/IDispatch.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_IDISPATCH_H_ #define POPTORCH_IDISPATCH_H_ #include #include #include #include #include #include #include #include "../ValueMapper.hpp" namespace poptorch { class IDispatch { public: explicit IDispatch(TensorStore *tensor_store); IDispatch(IDispatch &&other) noexcept = default; IDispatch &operator=(IDispatch &&other) noexcept = default; IDispatch(const IDispatch &other) noexcept = delete; IDispatch &operator=(const IDispatch &other) noexcept = delete; virtual ~IDispatch(); // Input tensor is a CPU tensor, returns an IPU tensor. virtual void addInput(const at::Tensor &cpu_tensor, const at::Tensor &ipu_tensor) = 0; // Constant tensor is a CPU tensor, returns an IPU tensor. virtual void addConstant(const at::Tensor &cpu_tensor, const at::Tensor &ipu_tensor) = 0; // Input tensor is a CPU tensor, returns an IPU tensor. virtual void addParameter(const at::Tensor &cpu_tensor, const at::Tensor &ipu_tensor) = 0; // Source tensor is an IPU tensor, destination is a CPU tensor. virtual void addOutput(const at::Tensor &ipu_src, const at::Tensor &cpu_dest) = 0; virtual void finalizeGraph() = 0; void setPythonStack(const std::vector &stack); // The "catch-all" fallback kernel. virtual void fallback(const c10::OperatorHandle &op, c10::Stack *stack) = 0; virtual void detach(const c10::OperatorHandle &op, c10::Stack *stack, bool moving_parameters) = 0; // Rather than have each empty overload requring a specialised kernel we // simply ask the dispatchers to acknowledge the created empty tensor and we // create it manually in the base function registration. 
virtual void registerEmptyTensor(const at::Tensor &empty, bool is_param) = 0; bool isParameter(const at::Tensor &t) const; void *getDataSource(torch::jit::Value *val); bool isParameter(torch::jit::Value *val); void replaceValue(torch::jit::Value *v_old, torch::jit::Value *v_new); void setParameterName(const at::Tensor &tensor, const std::string &name); std::string getParameterName(torch::jit::Value *val) const; void setParameterPerReplica(const std::string ¶m_name, const at::Tensor &tensor, int comm_group_type, int shards, int variable_retrieval_mode); bool getParameterPerReplica(torch::jit::Value *value, PerReplicaSettings &settings) const; protected: // We use the value mapper to map between incoming at::Tensors and JIT/MLIR // types. ValueMapper _mapper; // Used to create and manage tensors. This is a raw pointer to ensure this is // trivially copyable, but must never be nullptr. TensorStore *_tensor_store; virtual const std::vector> & getSourceLocationExcludes() const = 0; virtual void setCurrentCodeLocation(const torch::jit::SourceRange &source_location) = 0; private: torch::jit::SourceRange getPythonInterpreterSourceRange( const std::vector &cs) const; }; } // namespace poptorch #endif // POPTORCH_IDISPATCH_H_ ================================================ FILE: poptorch/source/dispatch_tracer/dispatchers/JitDispatch.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
#include "JitDispatch.hpp"

// NOTE(review): the header names of the next four directives are missing in
// this copy of the file - restore them from version control before building.
#include
#include
#include
#include

#include "../../PoptorchSymbols.hpp"
#include "../../popart_canonicalization/PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include "pytorch_bridge/CompilerOptions.hpp"

#include "../CommonHelperFunctions.hpp"
#include "../Tensor.hpp"

namespace poptorch {

// Scope guard: sets the current metadata string on construction and resets it
// to the empty string on destruction.
class WithMetadata {
public:
  explicit WithMetadata(const std::string &metadata) {
    setCurrentMetadata(metadata);
  }
  ~WithMetadata() { setCurrentMetadata(""); }
};

// Return the textual form of the graph, keeping only its tail.
//
// The number of line breaks kept is read once from the environment variable
// POPTORCH_MAX_GRAPH_LEN (default: 10). A value <= 0 disables truncation.
// If the graph has fewer line breaks than requested it is returned in full.
std::string truncateGraphString(torch::jit::Graph &graph) {
  static const int num_lines_max = [=]() {
    if (const char *graph_len = std::getenv("POPTORCH_MAX_GRAPH_LEN")) {
      const int n = std::stoi(graph_len);
      logging::trace("POPTORCH_MAX_GRAPH_LEN={}", n);
      return n;
    }
    const int n = 10;
    logging::trace("POPTORCH_MAX_GRAPH_LEN not set, defaulting to {}", n);
    return n;
  }();

  std::string s = graph.toString();
  if (num_lines_max <= 0 || s.empty()) {
    return s;
  }
  size_t start = s.size();
  for (int i = 0; i < num_lines_max; i++) {
    if (start == 0) {
      // The previous line break was the very first character: there is
      // nothing left to search. Without this check `start - 1` would wrap to
      // npos and rfind() would restart from the end of the string.
      return s;
    }
    start = s.rfind('\n', start - 1);
    if (start == std::string::npos) {
      // Didn't find another new line: print everything.
      return s;
    }
  }
  // Keep everything from the last line break found (inclusive).
  return "[...truncated...]" + s.substr(start);
}

JITDispatch::JITDispatch(const CompilerOptions &options,
                         TensorStore *tensor_store)
    : IDispatch(tensor_store), graph(std::make_shared<torch::jit::Graph>()),
      _opts(options), _type_inference_handler(tensor_store) {}

// Insert cpu_tensor (converted to the scalar type of ipu_tensor) as a constant
// in the graph and map ipu_tensor to it.
void JITDispatch::addConstant(const at::Tensor &cpu_tensor,
                              const at::Tensor &ipu_tensor) {
  ERROR_ON(!cpu_tensor.unsafeGetTensorImpl()->is_cpu());

  const auto src = cpu_tensor.to(ipu_tensor.scalar_type());

  const WithMetadata metadata("constant");
  auto *value = insertConstant(graph.get(), src);
  logging::trace("[DISPATCHER] Adding constant: Value {} with cpu ptr {}",
                 static_cast<void *>(value), cpu_tensor.data_ptr());

  _mapper.addTensor(ipu_tensor, value, false);
}

// Common implementation of addInput() / addParameter(): copy the CPU data to
// the tensor store, add a graph input and start tracking it.
void JITDispatch::addTensor(const at::Tensor &cpu_tensor,
                            const at::Tensor &ipu_tensor, bool is_parameter) {
  ERROR_ON(!cpu_tensor.unsafeGetTensorImpl()->is_cpu());
  errorOnZeroSizedTensor(cpu_tensor);

  const auto src = cpu_tensor.to(ipu_tensor.dtype());
  _tensor_store->copyFromCpu(ipu_tensor, src);

  torch::jit::Value *value = graph->addInput(cpu_tensor.name());
  setSourceRangeToCurrentLocation(value->node());
  value->setType(c10::TensorType::create(ipu_tensor)
                     ->withRequiresGrad(cpu_tensor.requires_grad()));

  logging::trace("[DISPATCHER] Adding {}: Value {} with cpu ptr {}",
                 is_parameter ? "parameter" : "input",
                 static_cast<void *>(value), src.data_ptr());

  _inplace_tracker.addTensor(value);
  _mapper.addTensor(ipu_tensor, value, is_parameter);
}

void JITDispatch::addInput(const at::Tensor &cpu_tensor,
                           const at::Tensor &ipu_tensor) {
  const WithMetadata metadata("input");
  addTensor(cpu_tensor, ipu_tensor, /* is_parameter= */ false);
}

void JITDispatch::addParameter(const at::Tensor &cpu_tensor,
                               const at::Tensor &ipu_tensor) {
  const WithMetadata metadata("parameter");
  const at::ScalarType type = cpu_tensor.scalar_type();
  // PopART doesn't allow non-floating point variables so add them as
  // constants instead. These will be deleted from parameters and buffers
  // in python before passed to lowering.
  if (!at::isFloatingType(type)) {
    return addConstant(cpu_tensor, ipu_tensor);
  }
  addTensor(cpu_tensor, ipu_tensor, /* is_parameter= */ true);
}

void JITDispatch::addOutput(const at::Tensor &ipu_src,
                            const at::Tensor &cpu_dest) {
  const WithMetadata metadata("output");
  // The PopART backend will allocate its own buffers: ignore cpu_dest.
  UNUSED(cpu_dest);

  auto *record = _mapper.rawTensorRecord(ipu_src);
  ERROR_ON_MSG(record == nullptr,
               "Internal: graph output tensor not present in value mapper "
                   << static_cast<void *>(&_mapper) << " for "
                   << static_cast<void *>(ipu_src.unsafeGetTensorImpl()));

  torch::jit::Value *output = record->jit;

  // If the output is an input: add an identity op to make sure the graph
  // is not empty.
  for (torch::jit::Value *input : graph->inputs()) {
    if (input == output) {
      auto *none = graph->createNone();
      insertNodeInGraph(graph.get(), none);
      output = createAndInsertNode(graph.get(), c10::aten::clone,
                                   {output, none->output()},
                                   ImplicitCast::None, OutputType::AsFirstInput)
                   ->output();
      break;
    }
  }

  logging::trace(
      "[DISPATCHER][JIT] Graph output: Tensor ptr {}, jit ir %{} "
      "(scalar type {})",
      reinterpret_cast<void *>(ipu_src.unsafeGetTensorImpl()),
      output->debugNameBase(),
      output->type()->expect<c10::TensorType>()->scalarType().value_or(
          at::ScalarType::Undefined));

  graph->registerOutput(output);
}

void JITDispatch::finalizeGraph() {
  // Clear the code location
  setCurrentPythonCodeLocation({});
}

void JITDispatch::registerEmptyTensor(const at::Tensor &tensor,
                                      bool is_param) {
  const WithMetadata metadata("empty");
  // Do not call copyAndCoerceType from this method:
  // the source tensor hasn't been added to the mapper yet.

  // The tensor shouldn't need converting anyway: it should be created with a
  // valid type.
  const auto coerced_scalar_type = coerceToSupportedType(tensor.scalar_type());
  ERROR_ON_MSG(
      coerced_scalar_type != tensor.scalar_type(),
      "[Internal error] The empty tensor should have a valid compiler type");
  // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None,
  //                           Layout? layout=None, Device? device=None,
  //                           bool? pin_memory=None,
  //                           MemoryFormat? memory_format=None) -> Tensor
  auto *g = graph.get();
  auto *const pin_memory = g->createNone();
  auto *const memory_format = g->createNone();
  insertNodeInGraph(g, pin_memory);
  insertNodeInGraph(g, memory_format);
  torch::jit::Node *n = createAndInsertNode(
      g, c10::aten::empty,
      {insertConstant(g, tensor.sizes()),
       insertConstant(g, tensor.scalar_type()),
       insertConstant(g, tensor.layout()), insertConstant(g, tensor.device()),
       pin_memory->output(), memory_format->output()});
  n->output()->inferTypeFrom(tensor);
  setSourceRangeToCurrentLocation(n);

  _mapper.addTensor(tensor, n->output(), is_param);
}

// aten::detach(Tensor(a) self) -> (Tensor(a))
void JITDispatch::detach(const c10::OperatorHandle &op, c10::Stack *stack,
                         bool moving_parameters) {
  // We only handle the special case when we're moving parameters here. If we're
  // not moving parameters, we'll defer to the fallback and actually create a
  // dispatch op on the PopART graph.
  if (!moving_parameters) {
    fallback(op, stack);
    return;
  }

  const c10::FunctionSchema &schema = op.schema();
  const auto num_arguments = schema.arguments().size();
  const auto arguments = torch::jit::last(stack, num_arguments);

  ERROR_ON(arguments.size() != 1);
  const at::Tensor in = arguments.front().toTensor();

  const at::Tensor out(in.unsafeGetTensorImpl()->shallow_copy_and_detach(
      /*version_counter=*/in.unsafeGetTensorImpl()->version_counter(),
      /*allow_tensor_metadata_change=*/true));

  // The new tensor is mapped to the same JIT value as the source.
  _mapper.addTensor(out, _mapper.getValueForTensor(in), true);

  torch::jit::drop(stack, num_arguments);
  torch::jit::push(stack, out);
}

const std::vector<std::vector<char>> &
JITDispatch::getSourceLocationExcludes() const {
  return _opts.dispatcher.source_location_excludes;
}

void JITDispatch::setCurrentCodeLocation(
    const torch::jit::SourceRange &source_location) {
  setCurrentPythonCodeLocation(source_location);
}

// Convert the operation into our normal IR style operation.
void JITDispatch::fixOutput(c10::Stack &stack, torch::jit::Node *node) {
  // Fix up the outputs.
  std::uint32_t output_index = 0;
  for (const c10::IValue &value : stack) {
    // Add any missing outputs. They frequently return scalars which we just
    // ignore here as our canonicalisation only returns tensors.
    while (output_index >= node->outputs().size()) {
      node->addOutput();
    }

    // Start tracking the output tensors, i.e. add them to the value mapper.
    torch::jit::Value *val = node->output(output_index);
    if (value.isTensor()) {
      const at::Tensor tensor = value.toTensor();
      val->inferTypeFrom(tensor);
      _mapper.addTensor(tensor, val, false);

      logging::trace(
          "[DISPATCHER][JIT] Output: Tensor ptr {}, jit ir %{} (scalar type "
          "{})",
          reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
          val->debugNameBase(),
          val->type()->expect<c10::TensorType>()->scalarType().value_or(
              at::ScalarType::Undefined));
    } else if (value.isTensorList()) {
      logging::trace("[DISPATCHER][JIT] Output tensor list: jit ir %{}",
                     val->debugName());
      val->setType(value.type()->expect<c10::ListType>());
      const auto tensor_list = value.toTensorVector();
      // Always insert list unpack if output value is a list.
      auto *const unpack = graph->createListUnpack(val, tensor_list.size());
      insertNodeInGraph(graph.get(), unpack);

      for (size_t i = 0; i < tensor_list.size(); ++i) {
        const at::Tensor &tensor = tensor_list.at(i);
        val = unpack->output(i);
        val->inferTypeFrom(copyAndCoerceType(tensor));
        _mapper.addTensor(tensor, val, false);
        logging::trace("[DISPATCHER][JIT] Output tensor list element: Tensor "
                       "ptr {}, jit ir %{} {}",
                       reinterpret_cast<void *>(tensor.unsafeGetTensorImpl()),
                       val->debugNameBase(), toString(tensor));
      }
    }

    output_index++;
  }
}

void JITDispatch::fallback(const c10::OperatorHandle &op, c10::Stack *stack) {
  const c10::FunctionSchema &schema = op.schema();

  // Run through the schema to find out if one of the operators is supposed to
  // be inplace, this could be the 'out' argument of a non-inplace op.
  const std::vector<at::Tensor> inplace_tensors =
      getInplaceArguments(*stack, schema);
  const std::size_t num_inplace_tensors = inplace_tensors.size();
  std::vector<torch::jit::Value *> aliased_inputs(num_inplace_tensors,
                                                  nullptr);
  if (!inplace_tensors.empty()) {
    std::transform(inplace_tensors.cbegin(), inplace_tensors.cend(),
                   aliased_inputs.begin(), [&](const auto &inplace_tensor) {
                     return _inplace_tracker.eraseCurrentAlias(
                         _mapper.getValueForTensor(inplace_tensor));
                   });
  }

  // Tag all the nodes created by the handler with the initial schema string
  // representation so that they can be traced back to top level ops in the
  // profiler.
  const WithMetadata metadata(c10::toString(schema));

  // Create a fake IR node for us to target using the schema.
  torch::jit::Node *node = lowerFromSchema(schema, stack, *graph, _mapper);
  logging::trace("[DISPATCHER][JIT] Node from schema {}", *node);

  if (!inplace_tensors.empty()) {
    // For inplace ops, cast all input tensors to the same type as the output
    // tensor.
    for (std::size_t output_tensor_id = 0;
         output_tensor_id < num_inplace_tensors; ++output_tensor_id) {
      const auto output_type =
          inplace_tensors.at(output_tensor_id).scalar_type();
      const bool output_float = c10::isFloatingType(output_type);
      for (size_t i = 0; i < stack->size(); i++) {
        const c10::IValue &sv = (*stack).at(i);
        if (!sv.isTensor()) {
          continue;
        }
        const at::Tensor &tensor = sv.toTensor();
        const auto input_type = tensor.scalar_type();
        const bool input_float = c10::isFloatingType(input_type);
        if (input_type == at::ScalarType::Undefined ||
            input_type == output_type || input_float != output_float ||
            !canCast(input_type, output_type)) {
          continue;
        }

        // Save where nodes will be inserted in the graph.
        auto *const curr_insert_point = graph->insertPoint();
        // Set insertion point before `node`.
        graph->setInsertPoint(node);

        torch::jit::Value *jv = node->input(i);
        torch::jit::Node *cast =
            createAndInsertCastOp(graph.get(), jv, output_type);
        node->replaceInputWith(jv, cast->output());

        // Restore old insertion point.
        graph->setInsertPoint(curr_insert_point);
      }
    }
  }

  // Validate the values on the stack before inferring the output types:
  // lists are walked recursively, a tensor of type 'Undefined' must be empty
  // and tuples / dictionaries are not supported.
  const std::function<void(const c10::IValue &)> process_value =
      [&](const c10::IValue &value) {
        if (value.isList()) {
          for (const auto &v : value.toList()) {
            process_value(v);
          }
        } else if (value.isTensor()) {
          const at::Tensor &tensor = value.toTensor();
          // Sometimes Undefined is used to mark an optional tensor as not set.
          if (tensor.scalar_type() == at::ScalarType::Undefined) {
            ERROR_ON_MSG(
                tensor.numel() != 0,
                "[Internal error] Non-empty tensor of type 'Undefined'");
            // No need to register the tensor if it's undefined.
            return;
          }
        } else {
          // If this assertion is hit then we need to add support for this kind
          // of value by going through the container and identifying all the
          // tensors.
          ERROR_ON_MSG(value.isTuple() || value.isGenericDict(),
                       "[Internal] Support for container "
                           << value.tagKind() << " not implemented");
        }
      };
  for (const c10::IValue &value : *stack) {
    process_value(value);
  }
  _type_inference_handler.inferOutputTypes(op, stack);

  // Fix the fake tensor so it can still work with our canonicalisation
  // functions which check the output.
  fixOutput(*stack, node);
  logging::trace("[DISPATCHER][JIT] Pre canonicalisation {}", *node);

  std::size_t i = 0;
  // Iterate by const reference: the values are only inspected for logging.
  for (const c10::IValue &value : *stack) {
    if (value.isTensor()) {
      const at::Tensor tensor = value.toTensor();
      logging::trace(
          "[DISPATCHER][JIT] Node tensor output at index {} size: ={}", i++,
          tensor.sizes());
    } else {
      logging::trace("[DISPATCHER][JIT] Node scalar output at index {}", i++);
    }
  }

  // Switcheroo the output so the inplace tensor reference is now pointing to
  // the output.
  if (!inplace_tensors.empty()) {
    for (std::size_t output_tensor_id = 0;
         output_tensor_id < num_inplace_tensors; ++output_tensor_id) {
      const at::Tensor output = stack->at(output_tensor_id).toTensor();

      // Get the jit value we are tracking for the output.
      torch::jit::Value *const value = _mapper.getValueForTensor(output);

      // If the modified inplace tensor was an alias for an input then
      // register the new alias.
      if (!aliased_inputs.empty()) {
        const auto &aliased_input = aliased_inputs.at(output_tensor_id);
        if (aliased_input != nullptr) {
          _inplace_tracker.registerAlias(aliased_input, value);
        }
      }

      // Overwrite the inplace tensor with that jit. Now a reference to the
      // inplace tensor correctly points to this outplace value.
      const auto &inplace_tensor = inplace_tensors.at(output_tensor_id);
      ValueMapper::TrackedTensor *const record =
          _mapper.rawTensorRecord(inplace_tensor);
      ERROR_ON_MSG(
          !record,
          "[DISPATCHER][JIT] Inplace op is not tracking inplace argument");

      // Ensure the value and torch tensor shapes match
      const JitTensorInfo value_info(value);
      inplace_tensor.unsafeGetTensorImpl()->set_sizes_contiguous(
          value_info.dims);

      // Validate to make sure the data type also matches.
      validateTensorShapeAndType(value, inplace_tensor);
      record->jit = value;
    }
  }
}

InplaceGraphInfo
JITDispatch::finalizeInplaceGraphInfo(size_t num_anchors,
                                      bool replicas_needing_broadcast) {
  return _inplace_tracker.finalizeGraph(*graph, num_anchors,
                                        replicas_needing_broadcast);
}

} // namespace poptorch

================================================
FILE: poptorch/source/dispatch_tracer/dispatchers/JitDispatch.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef POPTORCH_DISPATCH_JIT_DISPATCH_HPP_
#define POPTORCH_DISPATCH_JIT_DISPATCH_HPP_

// NOTE(review): the header names of the next five directives are missing in
// this copy of the file - restore them from version control before building.
#include
#include
#include
#include
#include

#include "pytorch_bridge/CompilerOptions.hpp"

#include "../TypeInferenceHandler.hpp"
#include "../ValueMapper.hpp"
#include "IDispatch.hpp"
#include "poptorch/InplaceOps.hpp"

namespace poptorch {

struct CompilerOptions;

// Dispatcher which records the dispatched operations in a torch::jit::Graph.
class JITDispatch final : public IDispatch {
public:
  JITDispatch(const CompilerOptions &options, TensorStore *tensor_store);

  // The JIT graph we are building up.
  std::shared_ptr<torch::jit::Graph> graph;

  void addConstant(const at::Tensor &cpu_tensor,
                   const at::Tensor &ipu_tensor) final;
  void addInput(const at::Tensor &cpu_tensor,
                const at::Tensor &ipu_tensor) final;
  void addParameter(const at::Tensor &cpu_tensor,
                    const at::Tensor &ipu_tensor) final;
  void addOutput(const at::Tensor &ipu_src, const at::Tensor &cpu_dest) final;
  void finalizeGraph() final;

  void fallback(const c10::OperatorHandle &op, c10::Stack *stack) override;

  void detach(const c10::OperatorHandle &op, c10::Stack *stack,
              bool moving_parameters) final;

  void registerEmptyTensor(const at::Tensor &tensor, bool is_param) final;

  // Node will be updated to the new target post canonicalisation.
  void fixOutput(c10::Stack &stack, torch::jit::Node *node);

  InplaceGraphInfo finalizeInplaceGraphInfo(size_t num_anchors,
                                            bool replicas_needing_broadcast);

private:
  // Shared implementation of addInput() and addParameter().
  void addTensor(const at::Tensor &cpu_tensor, const at::Tensor &ipu_tensor,
                 bool is_parameter);

  const std::vector<std::vector<char>> &getSourceLocationExcludes() const final;
  void setCurrentCodeLocation(
      const torch::jit::SourceRange &source_location) final;

  CompilerOptions _opts;

  TypeInferenceHandler _type_inference_handler;

  InplaceInputsTracker _inplace_tracker;
};

} // namespace poptorch

#endif // POPTORCH_DISPATCH_JIT_DISPATCH_HPP_

================================================
FILE: poptorch/source/include/poptorch/AliasProcessing.hpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_ALIAS_PROCESSING_H
#define INCLUDE_POPTORCH_ALIAS_PROCESSING_H

namespace torch {
namespace jit {
struct Graph;
} // namespace jit
} // namespace torch

namespace poptorch {

// Remove instances of aten::alias in the graph by replacing the outputs with
// the original (aliased) output. The known source of aliases is when an
// operation takes place on a wrapped buffer, for which the return value tensor
// is aliased and then set to be a member of the original (wrapper) subclass.
void resolveAliases(torch::jit::Graph *graph); } // namespace poptorch #endif // INCLUDE_POPTORCH_ALIAS_PROCESSING_H ================================================ FILE: poptorch/source/include/poptorch/CompilerOps.inc.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. // Auto generated file, do not modify // Run `python3 scripts/PopParse.py` to regenerate // clang-format off torch::jit::Node* createCopyvarupdate(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createBatchnormalization(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,float epsilon,float momentum, unsigned int num_node_outputs); torch::jit::Node* createBucketize(torch::jit::Graph *graph, const std::vector& args, bool right); torch::jit::Node* createGroupnormalization(torch::jit::Graph *graph, const std::vector& args,int64_t num_groups,float epsilon); torch::jit::Node* createSubsample(torch::jit::Graph *graph, const std::vector& args,const std::vector & strides); torch::jit::Node* createPrinttensor(torch::jit::Graph *graph, const std::vector& args,int64_t print_gradient,const std::string & title,const int summariseThreshold,const int edgeItems,const int maxLineWidth,const int digits,const int floatFormat,const char separator,const char openBracket,const char closeBracket); torch::jit::Node* createNop(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createScale(torch::jit::Graph *graph, const std::vector& args,float scale); torch::jit::Node* createScaledadd(torch::jit::Graph *graph, const std::vector& args,float scale0,float scale1); torch::jit::Node* createLstm(torch::jit::Graph *graph, const std::vector& args,int64_t outputFullSequence); torch::jit::Node* createGelu(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createGeluErf(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createDetach(torch::jit::Graph *graph, const 
std::vector& args); torch::jit::Node* createDepthtospace(torch::jit::Graph *graph, const std::vector& args,int64_t blocksize,const std::string & mode); torch::jit::Node* createRound(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createNearbyInt(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createDynamicslice(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes,std::int32_t noOverlap); torch::jit::Node* createDynamicupdate(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes,std::int32_t noOverlap); torch::jit::Node* createDynamiczero(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes); torch::jit::Node* createDynamicadd(torch::jit::Graph *graph, const std::vector& args,std::vector axes,std::vector sizes); torch::jit::Node* createSequenceslice(torch::jit::Graph *graph, const std::vector& args,std::int32_t zeroUnused); torch::jit::Node* createL1loss(torch::jit::Graph *graph, const std::vector& args,const float lambda,std::int32_t reduction); torch::jit::Node* createNllloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction,std::int32_t ignoreIndex,bool inputIsLogProbability); torch::jit::Node* createIdentityloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction); torch::jit::Node* create_ctcloss(torch::jit::Graph *graph, const std::vector& args,std::int32_t reduction,const unsigned int blank,const std::string & outDataType,const bool zeroInfinity); torch::jit::Node* createCtcbeamsearchdecoder(torch::jit::Graph *graph, const std::vector& args,unsigned int blank,unsigned int beamWidth,unsigned int topPaths); torch::jit::Node* createShapeddropout(torch::jit::Graph *graph, const std::vector& args,const std::vector & shape,float ratio); torch::jit::Node* createAtan2(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createExpm1(torch::jit::Graph *graph, const 
std::vector& args); torch::jit::Node* createLog1p(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createFmod(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createRemainder(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createReverse(torch::jit::Graph *graph, const std::vector& args,const std::vector & dimensions); torch::jit::Node* createSlice(torch::jit::Graph *graph, const std::vector& args,const std::vector & ends,const std::vector & starts,const std::vector & axes); torch::jit::Node* createBitwisenot(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createBitwiseand(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createBitwiseor(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createBitwisexor(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createBitwisexnor(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createReducemedian(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createScatterreduce(torch::jit::Graph *graph, const std::vector& args,std::int32_t axis_size,std::int32_t axis,bool enable_index_broadcast, std::int32_t reduction); torch::jit::Node* createGroupedscatterreduce(torch::jit::Graph *graph, const std::vector& args,std::int32_t axis_size,std::int32_t axis,std::int32_t group_size, bool enable_index_broadcast, std::int32_t reduction); torch::jit::Node* createSwish(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAveragepool(torch::jit::Graph *graph, const std::vector& args,const std::vector & kernel_shape,int64_t ceil_mode,int64_t count_include_pad,const std::vector & pads,const std::vector & strides); torch::jit::Node* createConvinteger(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & pads,const 
std::vector & strides); torch::jit::Node* createDequantizelinear(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createDropout(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,float ratio); torch::jit::Node* createIsinf(torch::jit::Graph *graph, const std::vector& args,int64_t detect_negative,int64_t detect_positive); torch::jit::Node* createMatmulinteger(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMaxpool(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,const std::vector & kernel_shape,int64_t ceil_mode,const std::vector & dilations,const std::vector & pads,int64_t storage_order,const std::vector & strides); torch::jit::Node* createMod(torch::jit::Graph *graph, const std::vector& args,int64_t fmod); torch::jit::Node* createNonmaxsuppression(torch::jit::Graph *graph, const std::vector& args,int64_t center_point_box); torch::jit::Node* createQlinearconv(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & pads,const std::vector & strides); torch::jit::Node* createQlinearmatmul(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createQuantizelinear(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createResize(torch::jit::Graph *graph, const std::vector& args,const std::string &coordinate_transformation_mode,float cubic_coeff_a,int64_t exclude_outside,float extrapolation_value,const std::string & mode,const std::string &nearest_mode); torch::jit::Node* createReversesequence(torch::jit::Graph *graph, const std::vector& args,int64_t batch_axis,int64_t time_axis); torch::jit::Node* createRoialign(torch::jit::Graph *graph, const std::vector& args,const std::string & mode,int64_t output_height,int64_t output_width,int64_t sampling_ratio,float spatial_scale); torch::jit::Node* createThresholdedrelu(torch::jit::Graph *graph, const 
std::vector& args,float alpha); torch::jit::Node* createTopk(torch::jit::Graph *graph, const std::vector& args,int64_t axis, bool largest, bool sorted); torch::jit::Node* createSort(torch::jit::Graph *graph, const std::vector& args,int64_t axis, bool descending, bool stable); torch::jit::Node* createUpsample(torch::jit::Graph *graph, const std::vector& args,const std::string & mode); torch::jit::Node* createAcosh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAsinh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAtanh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createCast(torch::jit::Graph *graph, const std::vector& args,const std::string & to); torch::jit::Node* createCompress(torch::jit::Graph *graph, const std::vector& args,std::int32_t axis); torch::jit::Node* createCosh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createErf(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createEyelike(torch::jit::Graph *graph, const std::vector& args,std::int32_t dtype,int64_t k); torch::jit::Node* createFlatten(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createGemm(torch::jit::Graph *graph, const std::vector& args,float alpha,float beta,int64_t transA,int64_t transB); torch::jit::Node* createGreater(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createIsnan(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createLess(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMatmul(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMaxunpool(torch::jit::Graph *graph, const std::vector& args,const std::vector & kernel_shape,const std::vector & pads,const std::vector & strides); torch::jit::Node* createMeanvariancenormalization(torch::jit::Graph *graph, const std::vector& args,const std::vector & axes); torch::jit::Node* 
createNonzero(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createOnehot(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createScatter(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createScatterElements(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createShrink(torch::jit::Graph *graph, const std::vector& args,float bias,float lambd); torch::jit::Node* createSign(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSinh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createTfidfvectorizer(torch::jit::Graph *graph, const std::vector& args,int64_t max_gram_length,int64_t max_skip_count,int64_t min_gram_length,const std::string & mode,const std::vector & ngram_counts,const std::vector & ngram_indexes,const std::vector & pool_int64s,const std::vector & pool_strings,std::vector weights); torch::jit::Node* createWhere(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createExpand(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMax(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMean(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMin(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSum(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAcos(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAdd(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createLogical_and(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAsin(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAtan(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createCos(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createDiv(torch::jit::Graph 
*graph, const std::vector& args); torch::jit::Node* createEqual(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMul(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createMultinomial(torch::jit::Graph *graph, const std::vector& args,int64_t dtype,int64_t sample_size,float seed); torch::jit::Node* createLogical_or(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createPow(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSin(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSub(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createTan(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createLogical_xor(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createAbs(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createArgmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t keepdims); torch::jit::Node* createArgmin(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t keepdims); torch::jit::Node* createCeil(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createClip(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createConcat(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createConv(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & pads,const std::vector & strides); torch::jit::Node* createConvtranspose(torch::jit::Graph *graph, const std::vector& args,const std::vector & dilations,int64_t group,const std::vector & kernel_shape,const std::vector & output_padding,const std::vector & output_shape,const std::vector & pads,const std::vector & strides); torch::jit::Node* createElu(torch::jit::Graph *graph, const std::vector& args,float alpha); torch::jit::Node* 
createExp(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createFloor(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createGather(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createGroupedgather(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t group_size); torch::jit::Node* createGlobalaveragepool(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createGloballppool(torch::jit::Graph *graph, const std::vector& args,int64_t p); torch::jit::Node* createGlobalmaxpool(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createHardsigmoid(torch::jit::Graph *graph, const std::vector& args,float alpha,float beta); torch::jit::Node* createHardmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createIdentity(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createInstancenormalization(torch::jit::Graph *graph, const std::vector& args,float epsilon); torch::jit::Node* createLrn(torch::jit::Graph *graph, const std::vector& args,int64_t size,float alpha,float beta,float bias); torch::jit::Node* createLeakyrelu(torch::jit::Graph *graph, const std::vector& args,float alpha); torch::jit::Node* createLog(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createLogsoftmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createLpnormalization(torch::jit::Graph *graph, const std::vector& args,int64_t axis,int64_t p); torch::jit::Node* createLppool(torch::jit::Graph *graph, const std::vector& args,const std::vector & kernel_shape,int64_t p,const std::vector & pads,const std::vector & strides); torch::jit::Node* createMaxroipool(torch::jit::Graph *graph, const std::vector& args,const std::vector & pooled_shape,float spatial_scale); torch::jit::Node* createNeg(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* 
createLogical_not(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createPad(torch::jit::Graph *graph, const std::vector& args,const std::string & mode); torch::jit::Node* createRandomnormallike(torch::jit::Graph *graph, const std::vector& args,std::int32_t dtype,float mean,float scale,float seed); torch::jit::Node* createRandomuniformlike(torch::jit::Graph *graph, const std::vector& args,std::int32_t dtype,float high,float low,float seed); torch::jit::Node* createReciprocal(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createReducel1(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducel2(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducelogsum(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducelogsumexp(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducemax(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducemean(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducemin(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReduceprod(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducesum(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createReducesumsquare(torch::jit::Graph *graph, const std::vector& args,std::vector axes,int64_t keepdims); torch::jit::Node* createRelu(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSelu(torch::jit::Graph *graph, const std::vector& args,float alpha,float gamma); torch::jit::Node* 
createShape(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSigmoid(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSize(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSoftmax(torch::jit::Graph *graph, const std::vector& args,int64_t axis); torch::jit::Node* createSoftplus(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSoftsign(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSpacetodepth(torch::jit::Graph *graph, const std::vector& args,int64_t blocksize); torch::jit::Node* createSplinebasis(torch::jit::Graph *graph, const std::vector& args,std::int32_t degree); torch::jit::Node* createSplineweighting(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSplit(torch::jit::Graph *graph, const std::vector& args,unsigned int num_outputs,int64_t axis,const std::vector & split); torch::jit::Node* createSqrt(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createSqueeze(torch::jit::Graph *graph, const std::vector& args,const std::vector & axes); torch::jit::Node* createTanh(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createTile(torch::jit::Graph *graph, const std::vector& args); torch::jit::Node* createTranspose(torch::jit::Graph *graph, const std::vector& args,const std::vector & perm); torch::jit::Node* createUnsqueeze(torch::jit::Graph *graph, const std::vector& args,const std::vector & axes); ================================================ FILE: poptorch/source/include/poptorch/ImplicitCasting.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP #define INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP #include namespace c10 { template class ArrayRef; } // namespace c10 namespace torch { namespace jit { template using ArrayRef = c10::ArrayRef; struct Graph; struct Value; } // namespace jit } // namespace torch namespace poptorch { enum class ImplicitCast { None, All, ExceptFirst, ExceptSecond, ExceptThird, ExceptFourthFifth }; enum class ImplicitCastOutput { None, AsPromoted, AlwaysBool, AlwaysFloat }; std::vector implicitCastInputs(torch::jit::ArrayRef *inputs, ImplicitCast implicit_cast); // TODO(T55228): remove after we use our own dispatch key. // With the dispatcher we catch implicit torch casts (intercepted with // JitDispatch::toCopyInplace) but it seems that in the case of CPU tensors, // the returned (casted) aten tensors are not reflected in the later ops, i.e. // we might end up with dead implicit casts in the ir which we clean with this // pass. The actual poptorch casting is done in our canonicalization handlers // anyway. void removeDeadImplicitCasts(torch::jit::Graph *graph); } // namespace poptorch #endif // INCLUDE_POPTORCH_IMPLICIT_CASTING_HPP ================================================ FILE: poptorch/source/include/poptorch/InplaceOpsPyTorch.hpp_nolint ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
// Any Modifications to code from PyTorch // From PyTorch: // Copyright (c) 2016- Facebook, Inc (Adam Paszke) // Copyright (c) 2014- Facebook, Inc (Soumith Chintala) // Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) // Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) // Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) // Copyright (c) 2011-2013 NYU (Clement Farabet) // Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) // Copyright (c) 2006 Idiap Research Institute (Samy Bengio) // Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) // From Caffe2: // Copyright (c) 2016-present, Facebook Inc. All rights reserved. // All contributions by Facebook: // Copyright (c) 2016 Facebook Inc. // All contributions by Google: // Copyright (c) 2015 Google Inc. // All rights reserved. // All contributions by Yangqing Jia: // Copyright (c) 2015 Yangqing Jia // All rights reserved. // All contributions by Kakao Brain: // Copyright 2019-2020 Kakao Brain // All contributions from Caffe: // Copyright(c) 2013, 2014, 2015, the respective contributors // All rights reserved. // All other contributions: // Copyright(c) 2015, 2016 the respective contributors // All rights reserved. // Caffe2 uses a copyright model similar to Caffe: each contributor holds // copyright over their contributions to Caffe2. The project versioning records // all such contribution and copyright details. If a contributor wants to further // mark their specific copyright on a particular contribution, they should // indicate their copyright solely in the commit message of the change when it is // committed. // All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // 1. 
Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America // and IDIAP Research Institute nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
// From torch/csrc/jit/passes/remove_inplace_ops.cpp which is // inaccesible from outside the module namespace torch { namespace jit { // Modify from source to handle only those which cannot be formed by removing // trailing _ static const std::unordered_map inPlaceToOutOfPlace = { {aten::zero_, aten::zeros_like}, {aten::fill_, aten::full_like}}; static const std::unordered_map expectedInputCount = { {aten::zero_, 6}, {aten::fill_, 7}}; // Modify from source to handle other in place ops not in the list bool isInplaceOp(const Node* node) { const char *kind_str = node->kind().toQualString(); size_t str_length = strlen(kind_str); if (str_length < 2) { return false; } // Handle ops like aten::__and__ if (kind_str[str_length-2] == '_') { return false; } return kind_str[str_length-1] == '_'; } } // namespace jit } // namespace torch ================================================ FILE: poptorch/source/include/poptorch/OpBuilder.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_OP_BUILDER_HPP #define INCLUDE_POPTORCH_OP_BUILDER_HPP #include #include #include #include #include #include #include #include "poptorch/ImplicitCasting.hpp" #include "poptorch_logging/Error.hpp" // Represents how the output type of the op is to be determined enum class OutputType { Unknown, AsFirstInput, AsThirdInput, FirstAsFirstInputSecondAlwaysInt, AsImplicitCastPromoted, AsDtype, AsDtypeOrAsPromoted, AlwaysBool, AlwaysFloat, AlwaysInt, AlwaysUint8 }; namespace c10 { template class optional; } // namespace c10 namespace poptorch { // RAII object to set / clear the current source code location // and metadata to those attached to the provided node. // (Useful when creating / replacing nodes in the graph). // [Important] This is not a stack: the metadata is cleared on // destruction. 
class WithNodeMetadata { public: explicit WithNodeMetadata(torch::jit::Node *node); ~WithNodeMetadata(); }; // Set the current source code location (i.e all the nodes created // will appear as having been instantiated from that location). void setCurrentPythonCodeLocation( const torch::jit::SourceRange &source_location); // Set the current metadata. (All the nodes created // will have this metadata attached to them). void setCurrentMetadata(const std::string &metadata); void resetCurrentSourceLocation(); torch::jit::Node *createNode( torch::jit::Graph *graph, torch::jit::NodeKind kind, torch::jit::ArrayRef inputs = {}, ImplicitCast implicit_cast = ImplicitCast::None, OutputType output_type = OutputType::Unknown, size_t num_outputs = 1, c10::optional dtype = c10::optional()); torch::jit::Node *createAndInsertNode( torch::jit::Graph *graph, torch::jit::NodeKind kind, torch::jit::ArrayRef inputs = {}, ImplicitCast implicit_cast = ImplicitCast::None, OutputType output_type = OutputType::Unknown, size_t num_outputs = 1, c10::optional dtype = c10::optional()); // All nodes should be added to the jit graph using one of the below `insert` // functions (or indirectly by using createAndInsertNode()). // These functions will ensure the new node contains all the required metadata // before it's added to the graph. 
void insertNodeInGraph(torch::jit::Graph *graph, torch::jit::Node *new_node);
void insertNodeBeforeNode(torch::jit::Node *new_node,
                          torch::jit::Node *insert_point);
void insertNodeAfterNode(torch::jit::Node *new_node,
                         torch::jit::Node *insert_point);

torch::jit::Value *insertConstant(torch::jit::Graph *graph,
                                  const torch::jit::IValue &val);

void setSourceRangeToCurrentLocation(torch::jit::Node *node);

// Called by createAndInsertNode except in the cases of OutputType::AsDtype and
// OutputType::AsDtypeOrAsPromoted where it should be called manually once the
// dtype attribute is set
void setNodeOutputsTypes(torch::jit::Node *node, ImplicitCast implicit_cast,
                         OutputType output_type);

// Where a constant created by tensorToConstant() is going to be used.
enum class UseOfNode { HostSideOnly, PopARTOnly, HostSideAndPopART };

// Create a poptorch::tensor_constant, poptorch::host_side_tensor_constant
// or poptorch::host_and_ipu_side_tensor_constant node from the given tensors,
// setting the output type accordingly.
// A constant which is simply returned, perhaps as a tuple or list, is labelled
// as a host side constant to prevent it being placed in PopART. A constant
// which is both returned unchanged and used in PopART needs a further pass to
// split it into two constants.
torch::jit::Node *
tensorToConstant(torch::jit::Graph *graph, const at::Tensor &t,
                 UseOfNode constant_use = UseOfNode::PopARTOnly);

// Manually added.
torch::jit::Node *createReshape(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &new_shape); torch::jit::Node * createConstantLong(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape); torch::jit::Node *createConstantInt(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape); torch::jit::Node * createConstantFloat32(torch::jit::Graph *graph, const std::vector &data, const std::vector &new_shape); // Create a constant float that inherits its underlying type (float16/32) from // tensor t torch::jit::Node * createConstantFloatLike(torch::jit::Graph *graph, torch::jit::Value *t, const std::vector &data, const std::vector &new_shape); template torch::jit::Node * createHandlerOperation(torch::jit::Graph *graph, SymbolHandler &&handler, torch::jit::ArrayRef inputs) { torch::jit::Node *inputs_node = graph->createTuple(inputs); return handler(graph, inputs_node); } torch::jit::Node * createCustomOperation(torch::jit::Graph *graph, const std::vector &inputs, const std::string &name, const std::string &domain, std::int64_t domainVersion, std::int64_t numOutputs, const std::string &attributes_id_str); torch::jit::Node *createCast(torch::jit::Graph *graph, torch::jit::Value *A, c10::ScalarType scalar); torch::jit::Node *createInternalCast(torch::jit::Graph *graph, torch::jit::Value *A, const std::string &type); torch::jit::Node *createConstantPad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape, float constant, bool direct_pad_shape_input = false); torch::jit::Node *createReflectionPad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape); torch::jit::Node *createEdgePad(torch::jit::Graph *graph, torch::jit::Value *A, const std::vector &pad_shape); torch::jit::Node *createAddNotInPlace(torch::jit::Graph *graph, torch::jit::Value *A, torch::jit::Value *B); torch::jit::Node *createStartForLoop(torch::jit::Graph *graph, torch::jit::Value *inputs); 
torch::jit::Node *createEndForLoop(torch::jit::Graph *graph, torch::jit::Value *outputs, torch::jit::Value *inputs, std::int64_t trip_count); torch::jit::Node *createStartIfBlock(torch::jit::Graph *graph, torch::jit::Value *condition); torch::jit::Node *createStartElseBlock(torch::jit::Graph *graph, torch::jit::Value *outputs_then); torch::jit::Node *createEndIfBlock(torch::jit::Graph *graph, torch::jit::Value *outputs_else, torch::jit::Value *condition); torch::jit::Node *createAddUntypedInputTensor(torch::jit::Graph *graph, torch::jit::Value *input); // Create an add output to mark a node of being an output of a subgraph. torch::jit::Node *createAddOutputTensor(torch::jit::Graph *graph, torch::jit::Value *output); torch::jit::Value *wrapInConstantVec(torch::jit::Graph *graph, const std::vector &data); template using FirstElmType = typename std::tuple_element<0, std::tuple>::type; template < typename... Ints, std::enable_if_t>::value, int> = 0> torch::jit::Value *wrapInConstant1D(torch::jit::Graph *graph, Ints... 
values) { std::vector data{std::forward(values)...}; return wrapInConstantVec(graph, data); } template struct CreateCast {}; template <> struct CreateCast { torch::jit::Node *operator()(torch::jit::Graph *graph, torch::jit::Value *value) const { return createCast(graph, value, c10::kFloat); } }; template <> struct CreateCast { torch::jit::Node *operator()(torch::jit::Graph *graph, torch::jit::Value *value) const { return createCast(graph, value, c10::kInt); } }; template <> struct CreateCast { torch::jit::Node *operator()(torch::jit::Graph *graph, torch::jit::Value *value) const { return createCast(graph, value, c10::kLong); } }; template torch::jit::Node *castToType(torch::jit::Graph *graph, torch::jit::Value *value) { return CreateCast{}(graph, value); } torch::jit::Node * createOptimizerGroup(torch::jit::Graph *graph, std::uint64_t group, const std::vector &list_of_params); torch::jit::Node *createRecomputationCheckpoint(torch::jit::Graph *graph, torch::jit::Value *value); torch::jit::Node *createUnfold(torch::jit::Graph *graph, torch::jit::Value *value, int64_t dimension, int64_t size, int64_t step); torch::jit::Node * createRandomNormal(torch::jit::Graph *graph, const std::vector &possible_inputs, const std::vector &shape, float mean, float scale, at::ScalarType dataType = at::ScalarType::Undefined); torch::jit::Node * createRandomUniform(torch::jit::Graph *graph, torch::jit::Value *possible_input, const std::vector &shape, float high, float low, at::ScalarType dataType = at::ScalarType::Undefined); torch::jit::Node *createPrintIpuTensor(torch::jit::Graph *graph, torch::jit::Value *value, const std::string &title); torch::jit::Node *createCallCpuOp(torch::jit::Graph *graph, const std::vector &value, const std::string &id, torch::jit::Node *node); torch::jit::Node *createSetAvailableMemory(torch::jit::Graph *graph, torch::jit::Value *value, float proportion); torch::jit::Node *createSetAttribute(torch::jit::Graph *graph, const std::string &attribute, const 
std::string &key, const std::string &value, bool insert_after_insertion_pnt = false); torch::jit::Node *createClearAttribute(torch::jit::Graph *graph, const std::string &attribute, const std::string &key, bool insert_after_insertion_pnt = false); torch::jit::Node *createSetMatMulSerialization(torch::jit::Graph *graph, torch::jit::Value *matmul, const std::string &mode, int64_t factor, bool keep_precision); torch::jit::Node *createBeginIpuBlock(torch::jit::Graph *graph, std::uint64_t stage, std::int64_t phase, std::int64_t ipu); torch::jit::Node *createMultiConvPart(torch::jit::Graph *graph, torch::jit::Node *conv_node); torch::jit::Node *createGru(torch::jit::Graph *graph, const std::vector &args, int64_t hidden_size); torch::jit::Node *createRnn(torch::jit::Graph *graph, const std::vector &args, const std::vector &activations); torch::jit::Node *createPrelu(torch::jit::Graph *graph, torch::jit::Value *self, torch::jit::Value *weight); // Autogenerated. #include "poptorch/CompilerOps.inc.hpp" } // namespace poptorch #endif // INCLUDE_POPTORCH_OP_BUILDER_HPP ================================================ FILE: poptorch/source/include/poptorch/OverlappedIO.hpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_OVERLAPPED_IO_H #define INCLUDE_POPTORCH_OVERLAPPED_IO_H namespace torch { namespace jit { struct Graph; } // namespace jit } // namespace torch namespace poptorch { // Turns any set_overlap_for_input nodes applied to inputs into attributes of // the parameter node. These attributes specify any host IO Overlapped for the // input void attributiseOverlappedIO(torch::jit::Graph *graph); } // namespace poptorch #endif // INCLUDE_POPTORCH_OVERLAPPED_IO_H ================================================ FILE: poptorch/source/include/poptorch/PopartCanonicalization.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#ifndef INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_
#define INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_
#include
#include
#include

namespace torch {
namespace jit {
struct Graph;
struct Node;
} // namespace jit
} // namespace torch

namespace at {
class Tensor;
} // namespace at

namespace poptorch {
/*
   The first canonicalization pass cleans up the pytorch IR to use popart
   specific operations and will remove all others. Constants will be folded
   into the attributes of the ops themselves.
*/
void canonicalize(torch::jit::Graph *graph);

/*
 * The second late canonicalization pass will take the popart code and will
 * enforce any constraints that aren't fixed by popart itself.
 */
void canonicalizeLate(torch::jit::Graph *graph);

/*
 * Error if any Aten ops remain in the graph after we have run canonicalisation
 * so the user can report exactly what operation we are missing.
 */
void errorOnUnsupportedAten(torch::jit::Graph *graph);

void annotateSubgraphs(torch::jit::Graph *graph, torch::jit::Node *start_node);

void removeSurplusIdentityLosses(torch::jit::Graph *graph);

// Clean up the graph if it is using CPU offloading.
void cpuOffloadingCleanup(torch::jit::Graph *graph);

// Handle the 'requires_grad=False' flag on tensors.
void addDetachOperations(torch::jit::Graph *graph);

// Popart scatterreduceop allows for non-expanded index to be passed in. It is
// essentially a fused and more efficient version of the expand + scatterreduce.
// This pass identifies all of the valid optimization cases and removes the
// explicit and sub-optimal index expansion before the scatter_add ops.
void removeScatterAddIndexExpansion(torch::jit::Graph *graph);

// Combine scatter operations, where possible, to execute a grouped version.
void groupScatterReduceAndGatherNodes(torch::jit::Graph *graph);

// PyTorch's `gather` works differently to PopART's (aka. PyTorch's
// `index_select`), but in certain cases when the indices tensor has been
// passed through an `expand`, they're equivalent (if the non-expanded indices
// are used). Swapping out the handling saves some ops, but is also more
// efficient if the expanded indices tensor is just a long series of slices.
void simplifyGatherWithExpandedIndices(torch::jit::Graph *graph);

// Adds the op as the possible true input op to set_available_memory if it is
// of a valid kind.
// Some ops are composed of multiple ops, and their return values might not be
// an op that accepts set_available_memory.
void setAvailableMemoryAddPossibleInputOp(torch::jit::Node *node);

// Ensure that the input to the given set_available_memory op is the one that
// supports set_available_memory; if it's not, move it to the right place.
void moveSetAvailableMemoryIfRequired(torch::jit::Node *node);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_TRANSFORM_ATEN_TO_POPART_HPP_

================================================
FILE: poptorch/source/include/poptorch/RequiresGrad.hpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#ifndef INCLUDE_POPTORCH_REQUIRES_GRAD_H
#define INCLUDE_POPTORCH_REQUIRES_GRAD_H

namespace torch {
namespace jit {
struct Graph;
} // namespace jit
} // namespace torch

namespace poptorch {

// Autograd sets the requires_grad flag on the ATen tensors
// after we've instantiated the corresponding ATen node in the dispatcher.
// This pass goes through all the nodes in the ATen graph and sets the
// requires_grad flag on a node's outputs if any of its inputs has
// requires_grad set.
void fixRequiresGradFromDispatch(torch::jit::Graph *graph);

} // namespace poptorch

#endif // INCLUDE_POPTORCH_REQUIRES_GRAD_H

================================================
FILE: poptorch/source/include/poptorch/TypeAndConstantCanonicalization.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd.
All rights reserved. #ifndef INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H #define INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H #include #include #include namespace at { class Tensor; } // namespace at namespace c10 { struct Symbol; } // namespace c10 namespace torch { namespace jit { struct Graph; struct Node; } // namespace jit } // namespace torch namespace poptorch { namespace type_and_constant_canonicalization { // Add the number of elements of the list to the type by replacing it with // ListTypeWithNumElements instances. The PyTorch ListType does not contain // the number of elements. If revert is "true", reverts all such types to the // original ListType. void addListNumElements(torch::jit::Graph *graph, bool revert = false); void evaluateConstexprs(torch::jit::Graph *graph); // Turn non-floating point parameters into constants as these are not supported // in popart. The pass also removes the affected graph inputs and modifies // 'parameter_names' and 'traced_parameter_tensors' accordingly. void makeConstantIntParams(torch::jit::Graph *graph, std::vector ¶meter_names, std::vector &traced_parameter_tensors); // Change the graph to add a poptorch::host_side_cast node after every graph // input whose type is unsupported (Long, Double, BFloat16) to reflect the // casting which would happen on the host and the correct types as they // would be on the graph. void castUnsupportedInputs(torch::jit::Graph *graph); // Change any unsupported output types to the appropriate equivalent (e.g. // double to float) and warn; error on any totally unsupported types e.g. 8 bit. void checkAndChangeOutputTypes(torch::jit::Graph *graph); // Changes all constants used in implicit casting operations into tensor // constants (poptorch::tensor_constant) of the correct type. 
void canonicaliseConstants(torch::jit::Graph *graph); } // namespace type_and_constant_canonicalization } // namespace poptorch #endif // INCLUDE_POPTORCH_TYPE_AND_CONSTANT_CANONICALIZATION_H ================================================ FILE: poptorch/source/popart_canonicalization/ActivationOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { torch::jit::Node *gluHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // "aten::glu(Tensor self, int dim) -> Tensor" // The input IR before canonicalization: // %3 : Float(2:96, 4:24, 6:4, 4:1) = aten::glu(%input, %4) // The output IR after canonicalization. It takes 3 steps. // 1. split the intput into two halves // %5 : FloatTensor, %6 : FloatTensor = popart::split[num_outputs=2, axis=3, // split=[4, 4]](%input) // 2. sigmoid the 2nd half // %7 : FloatTensor = popart::sigmoid(%6) // 3. 
multiply the 1st half and the sigmoid result // %8 : Float(2:96, 4:24, 6:4, 4:1) = popart::mul(%5, %7) // Input torch::jit::Value *input = node->input(0); std::int64_t axis = constantToLong(node->input(1)->node()); const std::vector shape_input = shapeFromTensor(input); const std::int64_t size = shape_input.size(); // handle python's negative indices if (axis < 0) { axis += size; } ERROR_ON_MSG(axis < 0 || axis >= size, "The second input argument of glu is not in the legal range"); ERROR_ON_MSG(shape_input[axis] % 2, "Halving dimension" << axis << "must be even"); const unsigned int half_size = static_cast(shape_input[axis] / 2); const std::vector split_sizes = {half_size, half_size}; torch::jit::Node *split = createSplit(graph, {input}, 2, axis, split_sizes); torch::jit::Node *sigmoid = createSigmoid(graph, {split->output(1)}); return createMul(graph, {split->output(0), sigmoid->output()}); } torch::jit::Node *rreluHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // clang-format off // aten::rrelu(Tensor self, Scalar lower=0.125, // Scalar upper=0.3333333333333333, // bool training=False, Generator? generator=None) -> Tensor // aten::rrelu_with_noise(Tensor self, Tensor noise, // Scalar lower, Scalar upper, // bool training, Generator? generator) -> Tensor // // training: rrelu(x) = x if x >= 0 // = a * x if x < 0, where a uniformly random value // from [lower, upper] // inference: rrelu(x) = x if x >= 0 // = x * ((lower + upper) / 2) // clang-format on torch::jit::Value *x = node->input(0); int64_t next_idx = 1; if (node->kind() == c10::aten::rrelu_with_noise) { next_idx++; // skip noise parameter logging::warn("Noise parameter not supported for aten::rrelu_with_noise"); } const float lower = constantToFloat(node->input(next_idx++)->node()); const float upper = constantToFloat(node->input(next_idx++)->node()); const bool is_training = constantToBool(node->input(next_idx++)->node()); auto *val = is_training ? 
createRandomUniform(graph, x, shapeFromTensor(x), upper, lower) ->output() : createConstantFloatLike(graph, x, {(lower + upper) / 2}, {}) ->output(); auto *zero = createConstantFloatLike(graph, x, {0}, {})->output(); auto *xlt0 = createLess(graph, {x, zero})->output(); auto *mul = createMul(graph, {x, val})->output(); return createWhere(graph, {xlt0, mul, x}); } torch::jit::Node *softplusHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto input_type = getNodeScalarType(x); auto beta = constantToFloat(node->input(1)->node()); auto threshold = constantToFloat(node->input(2)->node()); const auto msg = fmt::format("{{\"beta\":{},\"threshold\":{}}}", beta, threshold); auto *output_node = createCustomOperation(graph, {x}, "TorchSoftplus", "poptorch.custom_ops", 1, 1, msg); output_node->output(0)->setType(c10::TensorType::create( input_type, c10::nullopt, c10::nullopt, c10::nullopt)); return output_node; } torch::jit::Node *hardsigmoidHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); // hardsigmoid(x, 1/6, 0.5) return createHardsigmoid(graph, {x}, 1.0 / 6.0, 0.5); } torch::jit::Node *hardswishHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto *t0 = createConstantFloatLike(graph, x, {0.0}, {})->output(); auto *t1 = createMax(graph, {x, t0})->output(); auto *t2 = createAbs(graph, {x})->output(); auto *t3 = createConstantFloatLike(graph, x, {3.0}, {})->output(); auto *t4 = createGreater(graph, {t2, t3})->output(); auto *t5 = createAdd(graph, {x, t3})->output(); auto *t6 = createMul(graph, {x, t5})->output(); auto *t7 = createConstantFloatLike(graph, x, {6.0}, {})->output(); auto *t8 = createDiv(graph, {t6, t7})->output(); // where(greater(abs(x), 3), max(x, 0), (x + 3) * x / 6.0) return createWhere(graph, {t4, t1, t8}); } torch::jit::Node *preluHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *self = node->input(0); auto *weight = node->input(1); 
return createPrelu(graph, self, weight); } torch::jit::Node *geluHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *input = node->input(0); const auto approximate = constantToString(node->input(1)->node()); if (approximate == "tanh") { return createGelu(graph, {input}); } if (approximate == "none") { // TODO: use createGeluErf when it will reach sufficient // performance return createGelu(graph, {input}); } ERROR("Unknown GELU approximate '" << approximate << "'"); } torch::jit::Node *mishHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *src = node->input(0); auto *const neg_src = createNeg(graph, {src}); auto *const sigm = createSigmoid(graph, {neg_src->output()}); auto *const mul = createMul(graph, {sigm->output(), sigm->output()}); const auto shape = shapeFromTensor(mul->output()); const size_t size = std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies()); const std::vector ones_vec(size, 1.0); auto *const one = createConstantFloat32(graph, ones_vec, shape); auto *const one_minus_mul = createSub(graph, {one->output(), mul->output()}); auto *const one_plus_mul = createAdd(graph, {one->output(), mul->output()}); auto *const div = createDiv(graph, {one_minus_mul->output(), one_plus_mul->output()}); return createMul(graph, {src, div->output()}); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::glu, gluHandler); registerHandler(c10::aten::rrelu, rreluHandler); registerHandler(c10::aten::rrelu_with_noise, rreluHandler); registerHandler(c10::aten::softplus, softplusHandler); registerHandler(c10::aten::hardsigmoid, hardsigmoidHandler); registerHandler(c10::aten::hardswish, hardswishHandler); registerHandler(c10::aten::prelu, preluHandler); registerHandler(c10::aten::_prelu_kernel, preluHandler); registerHandler(c10::aten::gelu, geluHandler); registerHandler(c10::aten::mish, mishHandler); } } // namespace poptorch 
================================================ FILE: poptorch/source/popart_canonicalization/ArithmeticOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace { torch::jit::Node *addHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor // aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> // (Tensor(a!)) torch::jit::Value *alpha_param = node->input(2); // If both types are bool, use logical_or if (allInputsBool(node, 2)) { ERROR_ON(!hasUnityValue(alpha_param)); return createLogical_or(graph, {node->input(0), node->input(1)}); } // Ordinary addition torch::jit::Value *alpha_multiplicand = node->input(1); if (!hasUnityValue(alpha_param)) { auto *alpha_node = createMul(graph, {alpha_param, alpha_multiplicand}); alpha_multiplicand = alpha_node->output(); } return createAdd(graph, {node->input(0), alpha_multiplicand}); } torch::jit::Node *truncHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // Drop the exponent by casting to int and back. torch::jit::Node *to_int = createCast(graph, node->input(), c10::kInt); return createCast( graph, to_int->output(), *node->input()->type()->expect()->scalarType()); } torch::jit::Node *fracHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // Frac(x) = x - trunc(x) // Drop the exponent by casting to int and back. 
torch::jit::Node *to_int = createCast(graph, node->input(0), c10::kInt); torch::jit::Node *trunc = createCast( graph, to_int->output(), *node->input(0)->type()->expect()->scalarType()); return createSub(graph, {node->input(0), trunc->output()}); } torch::jit::Node *floorDivideHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::floor_divide(Tensor x, Tensor y) -> Tensor // floor_divide(x, y) = floor(x/y) where floor(...) rounds towards -inf. torch::jit::Node *quotient = createDiv(graph, {node->input(0), node->input(1)}); return createFloor(graph, {quotient->output()}); } torch::jit::Node *mulHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::mul(Tensor self, Tensor other) -> Tensor // If both types are bool, use logical_add if (allInputsBool(node)) { return createLogical_and(graph, {node->input(0), node->input(1)}); } // Ordinary multiplication return createMul(graph, {node->input(0), node->input(1)}); } torch::jit::Node *trueDivideHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::true_divide(Tensor x, Tensor y) -> Tensor // true_divide(x, y) = (float)x / (float)y torch::jit::Node *x = createCast(graph, node->input(0), c10::kFloat); torch::jit::Node *y = createCast(graph, node->input(1), c10::kFloat); return createDiv(graph, {x->output(), y->output()}); } torch::jit::Node *clampHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // We can't use PopART clip because it doesn't support integers, // so the following is used instead: // output = min(max(x, min_value), max_value) auto *x = node->input(0); auto *min_val = node->input(1); auto *max_val = node->input(2); c10::ScalarType const x_type = getNodeScalarType(x); if (x_type != c10::kInt) { if (!isNone(min_val->node()) && getNodeScalarType(min_val) != x_type) { min_val = createCast(graph, node->input(1), x_type)->output(); } if (!isNone(max_val->node()) && getNodeScalarType(max_val) != x_type) { max_val = createCast(graph, node->input(2), 
x_type)->output(); } } auto *max = isNone(min_val->node()) ? x->node() : createMax(graph, {x, min_val}); auto *min = isNone(max_val->node()) ? max : createMin(graph, {max->output(), max_val}); return min; } torch::jit::Node *clampMinHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *max = graph->createNone()->output(); auto *input = node->input(0); auto *min = node->input(1); auto clamp_handler = getHandler(c10::aten::clamp); return createHandlerOperation(graph, clamp_handler, {input, min, max}); } torch::jit::Node *clampMaxHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *min = graph->createNone()->output(); auto *input = node->input(0); auto *max = node->input(1); auto clamp_handler = getHandler(c10::aten::clamp); return createHandlerOperation(graph, clamp_handler, {input, min, max}); } torch::jit::Node *addCDivHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar // value=1, Tensor(a!) out) -> Tensor(a!) 
torch::jit::Node *div = createDiv(graph, {node->input(1), node->input(2)}); auto scale = constantToFloat(node->input(3)->node()); torch::jit::Node *scaled = createScale(graph, {div->output()}, scale); return createAdd(graph, {node->input(0), scaled->output()}); } torch::jit::Node *addCMulHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar // value=1) -> Tensor torch::jit::Node *mul = createMul(graph, {node->input(1), node->input(2)}); auto scale = constantToFloat(node->input(3)->node()); torch::jit::Node *scaled = createScale(graph, {mul->output()}, scale); return createAdd(graph, {node->input(0), scaled->output()}); } torch::jit::Node *crossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto *y = node->input(1); auto *opt_axis = node->input(2)->node(); auto x_shape = shapeFromTensor(x); auto y_shape = shapeFromTensor(y); ERROR_ON_MSG(x_shape.size() != y_shape.size(), "Cross product tensors must have same rank"); for (unsigned i = 0; i < x_shape.size(); ++i) { ERROR_ON_MSG(x_shape[i] != y_shape[i], "Cross product tensors must have same shape"); } unsigned axis = 0; if (isNone(opt_axis)) { // if unspecified, the axis is the first to have dimension 3 for (unsigned i = 0; i < x_shape.size(); ++i) { if (x_shape[i] == 3) { axis = i; break; } } } else { axis = constantToInt(opt_axis); } ERROR_ON_MSG(x_shape[axis] != 3, "Cross product product axis must have dimension 3"); auto *indices = createConstantInt(graph, {2, 0, 1}, {3})->output(); // circular permutation right by 1 along the axis auto *x_roll = createGather(graph, {x, indices}, axis)->output(); auto *y_roll = createGather(graph, {y, indices}, axis)->output(); // products of one straight input with the other input permuted auto *mul_x_y_roll = createMul(graph, {x, y_roll})->output(); auto *mul_y_x_roll = createMul(graph, {y, x_roll})->output(); // subtraction produces result permuted one position 
left auto *result_roll = createSub(graph, {mul_y_x_roll, mul_x_y_roll})->output(); // permute to compute final result return createGather(graph, {result_roll, indices}, axis); } std::pair calculateVarMean(torch::jit::Graph *graph, const c10::ArrayRef &inputs, const std::string &op_name) { auto *x = inputs[0]; auto shape = shapeFromTensor(x); std::vector dims; // If true, bessel's correction is applied bool unbiased = false; bool keepdim = false; switch (inputs.size()) { case 2: { // aten::var(Tensor input, bool unbiased) dims.resize(shape.size()); // dims are unspecified so reduce over all std::iota(dims.begin(), dims.end(), 0); unbiased = constantToBool(inputs[1]->node()); } break; case 4: { // aten::var(Tensor input, int[] dim, bool unbiased, bool keepdim) // or torch.var.correction(Tensor input, int[]? dim, *, bool unbiased, bool // keepdim) from the compiler if (inputs[1]->node()->kind() == c10::prim::ListConstruct) { dims = constantToLongVec(inputs[1]->node()); } else { dims.resize(shape.size()); // dims are unspecified so reduce over all std::iota(dims.begin(), dims.end(), 0); } unbiased = constantToBool(inputs[2]->node()); keepdim = constantToBool(inputs[3]->node()); } break; default: ERROR("Invalid number of arguments to aten::" << op_name); } // Keep the reduced dims so we can broadcast for the subtraction auto *mean_keepdim = createReducemean(graph, {x}, dims, 1)->output(); // Also keep a copy without singleton dims so we can pass to auto *mean = createSqueeze(graph, {mean_keepdim}, dims)->output(); auto *x_minus_mean = createSub(graph, {x, mean_keepdim})->output(); auto *x_minus_mean_sqr = createMul(graph, {x_minus_mean, x_minus_mean})->output(); auto *var = createReducemean(graph, {x_minus_mean_sqr}, dims, static_cast(keepdim)) ->output(); if (unbiased) { // Apply bessel's correction by multipling the biased variance by // n / (n - 1), where n is the sample size std::int64_t numel_reduced = 1; for (auto dim : dims) { if (dim < 0) { dim += shape.size(); 
} numel_reduced *= shape[dim]; } const double n = static_cast(numel_reduced); auto *unbiased_factor = createConstantFloatLike(graph, x, {n / (n - 1)}, {}); var = createMul(graph, {var, unbiased_factor->output()})->output(); } return {var, mean}; } torch::jit::Node *varHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::var(Tensor input, bool unbiased) // aten::var(Tensor input, int[] dim, bool unbiased, bool keepdim) return calculateVarMean(graph, node->inputs(), "var").first->node(); } torch::jit::Node *varMeanHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::var_mean(Tensor input, bool unbiased) -> (Tensor, Tensor) // aten::var_mean(Tensor input, int[] dim, bool unbiased, bool keepdim) // -> (Tensor, Tensor) auto var_mean = calculateVarMean(graph, node->inputs(), "var_mean"); replaceOutputUse(node->output(0), var_mean.first); replaceOutputUse(node->output(1), var_mean.second); markNodeForDeletion(node); return nullptr; } torch::jit::Node *stdHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::std(Tensor input, bool unbiased) // aten::std(Tensor input, int[] dim, bool unbiased, bool keepdim) auto *var = calculateVarMean(graph, node->inputs(), "std").first->node(); return createSqrt(graph, {var->output()}); } torch::jit::Node *stdMeanHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::std_mean(Tensor input, bool unbiased) -> (Tensor, Tensor) // aten::std_mean(Tensor input, int[] dim, bool unbiased, bool keepdim) // -> (Tensor, Tensor) auto var_mean = calculateVarMean(graph, node->inputs(), "std_mean"); auto *std = createSqrt(graph, {var_mean.first}); replaceOutputUse(node->output(0), std->output()); replaceOutputUse(node->output(1), var_mean.second); markNodeForDeletion(node); return nullptr; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::add, addHandler); registerHandler(c10::aten::trunc, truncHandler); 
registerHandler(c10::aten::frac, fracHandler); registerHandler(c10::aten::floor_divide, floorDivideHandler); registerHandler(c10::aten::mul, mulHandler); registerHandler(c10::aten::true_divide, trueDivideHandler); registerHandler(c10::aten::clamp, clampHandler); registerHandler(c10::aten::clamp_min, clampMinHandler); registerHandler(c10::aten::clamp_max, clampMaxHandler); registerHandler(c10::aten::addcdiv, addCDivHandler); registerHandler(c10::aten::addcmul, addCMulHandler); registerHandler(c10::aten::cross, crossHandler); registerHandler(c10::aten::linalg_cross, crossHandler); registerHandler(c10::aten::var, varHandler); registerHandler(c10::aten::var_mean, varMeanHandler); registerHandler(c10::aten::std, stdHandler); registerHandler(c10::aten::std_mean, stdMeanHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/AtenHandlers.gen.cpp ================================================ // DO NOT EDIT! Generated by PopAtenHandlers.py // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
// NOTE(review): the header name of the following include appears to have
// been lost in extraction - restore it from the generator output.
#include

namespace poptorch {

namespace {

// Emits createAbs on input 0.
torch::jit::Node *absHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // abs(i0)
  return createAbs(graph, {i0});
}

// Emits createAcos on input 0.
torch::jit::Node *acosHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // acos(i0)
  return createAcos(graph, {i0});
}

// Emits createAcosh on input 0.
torch::jit::Node *acoshHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // acosh(i0)
  return createAcosh(graph, {i0});
}

// Emits createGemm on inputs 1, 2 and 0 (in that order). Input 4 (alpha) and
// input 3 (beta) are read as float constants and passed as the first and
// second scalar arguments respectively.
torch::jit::Node *addmmHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *y = node->input(1);
  auto *z = node->input(2);
  auto *x = node->input(0);
  auto *alpha = node->input(4);
  auto t0 = constantToFloat(alpha->node());
  auto *beta = node->input(3);
  auto t1 = constantToFloat(beta->node());
  // gemm(y, z, x, cfloat(alpha), cfloat(beta), 0, 0)
  return createGemm(graph, {y, z, x}, t0, t1, 0, 0);
}

// Emits createAsin on input 0.
torch::jit::Node *asinHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // asin(i0)
  return createAsin(graph, {i0});
}

// Emits createAsinh on input 0.
torch::jit::Node *asinhHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // asinh(i0)
  return createAsinh(graph, {i0});
}

// Emits createAtan on input 0.
torch::jit::Node *atanHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // atan(i0)
  return createAtan(graph, {i0});
}

// Emits createAtan2 on inputs 0 and 1.
torch::jit::Node *atan2Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // atan2(i0, i1)
  return createAtan2(graph, {i0, i1});
}

// Emits createAtanh on input 0.
torch::jit::Node *atanhHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // atanh(i0)
  return createAtanh(graph, {i0});
}
// Emits createConcat over the tensor list produced by input 0, along the
// axis given by the integer constant in input 1.
torch::jit::Node *catHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = handleTensorList(x->node());
  auto *y = node->input(1);
  auto t1 = constantToLong(y->node());
  // concat(TensorList(x), clong(y))
  return createConcat(graph, {t0}, t1);
}

// Emits createCeil on input 0.
torch::jit::Node *ceilHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // ceil(i0)
  return createCeil(graph, {i0});
}

// Builds max(x, 0) + min(0, a * expm1(x / a)) from input 0 (x) and
// input 1 (a).
torch::jit::Node *celuHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *a = node->input(1);
  auto *t0 = createDiv(graph, {x, a})->output();
  // matched expm1: sub(exp(x), 1.0)
  auto *t1 = createExpm1(graph, {t0})->output();
  auto *t2 = createMul(graph, {a, t1})->output();
  auto *t3 = createConstantFloatLike(graph, x, {0.0}, {})->output();
  auto *t4 = createMax(graph, {x, t3})->output();
  auto *t5 = createMin(graph, {t3, t2})->output();
  // add(max(x, 0.0), min(0.0, mul(a, expm1(div(x, a)))))
  return createAdd(graph, {t4, t5});
}

// Emits createConstantPad on input 0, with the padding list from input 1 and
// the float fill value from input 2.
torch::jit::Node *constantPadNdHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *l = node->input(1);
  auto t0 = constantToLongVec(l->node());
  auto *c = node->input(2);
  auto t1 = constantToFloat(c->node());
  // constantPad(x, clong_list(l), cfloat(c))
  return createConstantPad(graph, x, t0, t1);
}

// Emits createCos on input 0.
torch::jit::Node *cosHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // cos(i0)
  return createCos(graph, {i0});
}

// Emits createCosh on input 0.
torch::jit::Node *coshHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // cosh(i0)
  return createCosh(graph, {i0});
}

// Emits createDetach on input 0.
torch::jit::Node *detachHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // detach(i0)
  return createDetach(graph, {i0});
}

// Emits createDiv on inputs 0 and 1.
torch::jit::Node *divHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // div(i0, i1)
  return createDiv(graph, {i0, i1});
}
// Emits createSelu on input 0 with the float constants from inputs 1 and 2.
torch::jit::Node *eluHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto t0 = constantToFloat(y->node());
  auto *z = node->input(2);
  auto t1 = constantToFloat(z->node());
  // selu(x, cfloat(y), cfloat(z))
  return createSelu(graph, {x}, t0, t1);
}

// Emits createEqual on inputs 0 and 1.
torch::jit::Node *eqHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // equal(i0, i1)
  return createEqual(graph, {i0, i1});
}

// Emits createErf on input 0.
torch::jit::Node *erfHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // erf(i0)
  return createErf(graph, {i0});
}

// Builds 1 - erf(x) from input 0.
torch::jit::Node *erfcHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = createErf(graph, {x})->output();
  auto *t1 = createConstantFloatLike(graph, t0, {1.0}, {})->output();
  // sub(1.0, erf(x))
  return createSub(graph, {t1, t0});
}

// Emits createExp on input 0.
torch::jit::Node *expHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // exp(i0)
  return createExp(graph, {i0});
}

// Emits createExpm1 on input 0.
torch::jit::Node *expm1Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // expm1(i0)
  return createExpm1(graph, {i0});
}

// Emits createFloor on input 0.
torch::jit::Node *floorHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // floor(i0)
  return createFloor(graph, {i0});
}

// Emits createFmod on inputs 0 and 1.
torch::jit::Node *fmodHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // fmod(i0, i1)
  return createFmod(graph, {i0, i1});
}

// Builds x >= y as (x > y) OR (x == y) from inputs 0 and 1.
torch::jit::Node *geHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *t0 = createGreater(graph, {x, y})->output();
  auto *t1 = createEqual(graph, {x, y})->output();
  // logical_or(greater(x, y), equal(x, y))
  return createLogical_or(graph, {t0, t1});
}

// Emits createGreater on inputs 0 and 1.
torch::jit::Node *gtHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // greater(i0, i1)
  return createGreater(graph, {i0, i1});
}

// Keeps x (input 0) where |x| > |l| (input 1) and yields 0 elsewhere.
torch::jit::Node *hardshrinkHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = createAbs(graph, {x})->output();
  auto *l = node->input(1);
  auto *t1 = createAbs(graph, {l})->output();
  auto *t2 = createGreater(graph, {t0, t1})->output();
  auto *t3 = createConstantFloatLike(graph, x, {0.0}, {})->output();
  // where(greater(abs(x), abs(l)), x, 0.0)
  return createWhere(graph, {t2, x, t3});
}

// Emits createClip on input 0 with bounds from inputs 1 and 2. A bound whose
// scalar type is not float is first passed through CreateCast.
torch::jit::Node *hardtanhHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  auto *x = node->input(0);
  const CreateCast cast_obj;
  auto *t1 = node->input(1);
  auto *t2 = node->input(2);
  auto *a = getNodeScalarType(t1) != c10::kFloat ? cast_obj(graph, t1)->output()
                                                 : t1;
  auto *b = getNodeScalarType(t2) != c10::kFloat ? cast_obj(graph, t2)->output()
                                                 : t2;
  // clip(x, a, b)
  return createClip(graph, {x, a, b});
}

// Builds the element-wise hinge embedding loss from x (input 0), y (input 1)
// and delta (input 2), then wraps it in createIdentityloss using the
// reduction read from input 3.
torch::jit::Node *hingeEmbeddingLossHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  auto *y = node->input(1);
  auto *t0 = createConstantFloatLike(graph, y, {-1.0}, {})->output();
  auto *t1 = createEqual(graph, {y, t0})->output();
  auto *delta = node->input(2);
  auto *x = node->input(0);
  auto *t2 = createSub(graph, {delta, x})->output();
  auto *t3 = createConstantFloatLike(graph, t2, {0.0}, {})->output();
  auto *t4 = createMax(graph, {t3, t2})->output();
  auto *t5 = createConstantFloatLike(graph, y, {1.0}, {})->output();
  auto *t6 = createEqual(graph, {y, t5})->output();
  auto *t7 = createWhere(graph, {t6, x, t3})->output();
  auto *t8 = createWhere(graph, {t1, t4, t7})->output();
  auto *red = node->input(3);
  auto t9 = constantToLong(red->node());
  auto t10 = convertReduceToPopart(t9);
  // identityloss(where(equal(y, -1.0), max(0.0, sub(delta, x)),
  // where(equal(y, 1.0), x, 0.0)), reduction(clong(red)))
  return createIdentityloss(graph, {t8}, t10);
}

// Emits createGather on x (input 0) with the indices from input 2, along the
// dimension resolved from input 1 by handleDimensionParam.
torch::jit::Node *indexSelectHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *i = node->input(2);
  auto *d = node->input(1);
  // NOTE(review): the template argument of expect() appears to have been
  // lost in extraction (the trailing comment suggests TensorType) - confirm.
  auto t0 = x->type()->expect();
  auto t1 = handleDimensionParam(d, t0);
  // gather(x, i, dimension(d, TensorType(x)))
  return createGather(graph, {x, i}, t1);
}

// Emits createIsnan on input 0.
torch::jit::Node *isnanHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // isnan(i0)
  return createIsnan(graph, {i0});
}

// Builds createL1loss over x - y (inputs 0 and 1) with the reduction read
// from input 2, then wraps the result in createIdentityloss with the fixed
// reduction value 2.
torch::jit::Node *l1LossHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *t0 = createSub(graph, {x, y})->output();
  auto *red = node->input(2);
  auto t1 = constantToLong(red->node());
  auto t2 = convertReduceToPopart(t1);
  auto *t3 = createL1loss(graph, {t0}, 1.0, t2)->output();
  // identityloss(l1loss(sub(x, y), 1.0, reduction(clong(red))), 2)
  return createIdentityloss(graph, {t3}, 2);
}

// Builds x <= y as (x < y) OR (x == y) from inputs 0 and 1.
torch::jit::Node *leHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *t0 = createLess(graph, {x, y})->output();
  auto *t1 = createEqual(graph, {x, y})->output();
  // logical_or(less(x, y), equal(x, y))
  return createLogical_or(graph, {t0, t1});
}

// Emits createLeakyrelu on input 0 with the float constant from input 1.
torch::jit::Node *leakyReluHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto t0 = constantToFloat(y->node());
  // leakyrelu(x, cfloat(y))
  return createLeakyrelu(graph, {x}, t0);
}

// Emits createLog on input 0.
torch::jit::Node *logHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // log(i0)
  return createLog(graph, {i0});
}

// Builds log(x) / 2.302585092994046 (the constant is ln 10) from input 0.
torch::jit::Node *log10Handler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = createLog(graph, {x})->output();
  auto *t1 =
      createConstantFloatLike(graph, t0, {2.302585092994046}, {})->output();
  // div(log(x), 2.302585092994046)
  return createDiv(graph, {t0, t1});
}

// Emits createLog1p on input 0.
torch::jit::Node *log1pHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // log1p(i0)
  return createLog1p(graph, {i0});
}

// Builds log(x) / 0.6931471805599453 (the constant is ln 2) from input 0.
torch::jit::Node *log2Handler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = createLog(graph, {x})->output();
  auto *t1 =
      createConstantFloatLike(graph, t0, {0.6931471805599453}, {})->output();
  // div(log(x), 0.6931471805599453)
  return createDiv(graph, {t0, t1});
}

// Builds log(sigmoid(x)) from input 0.
torch::jit::Node *logSigmoidHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = createSigmoid(graph, {x})->output();
  // log(sigmoid(x))
  return createLog(graph, {t0});
}

// Emits createLogical_and on inputs 0 and 1.
torch::jit::Node *logicalAndHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // logical_and(i0, i1)
  return createLogical_and(graph, {i0, i1});
}

// Emits createLogical_not on input 0.
torch::jit::Node *logicalNotHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // logical_not(i0)
  return createLogical_not(graph, {i0});
}

// Emits createLogical_or on inputs 0 and 1.
torch::jit::Node *logicalOrHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // logical_or(i0, i1)
  return createLogical_or(graph, {i0, i1});
}

// Emits createLess on inputs 0 and 1.
torch::jit::Node *ltHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // less(i0, i1)
  return createLess(graph, {i0, i1});
}

// Builds max(-y * (x1 - x2) + margin, 0) from x1 (input 0), x2 (input 1),
// y (input 2) and margin (input 3), then wraps it in createIdentityloss using
// the reduction read from input 4.
torch::jit::Node *marginRankingLossHandler(torch::jit::Graph *graph,
                                           torch::jit::Node *node) {
  auto *y = node->input(2);
  auto *t0 = createNeg(graph, {y})->output();
  auto *x1 = node->input(0);
  auto *x2 = node->input(1);
  auto *t1 = createSub(graph, {x1, x2})->output();
  auto *t2 = createMul(graph, {t0, t1})->output();
  auto *margin = node->input(3);
  auto *t3 = createAdd(graph, {t2, margin})->output();
  auto *t4 = createConstantFloatLike(graph, t3, {0.0}, {})->output();
  auto *t5 = createMax(graph, {t3, t4})->output();
  auto *red = node->input(4);
  auto t6 = constantToLong(red->node());
  auto t7 = convertReduceToPopart(t6);
  // identityloss(max(add(mul(neg(y), sub(x1, x2)), margin), 0.0),
  // reduction(clong(red)))
  return createIdentityloss(graph, {t5}, t7);
}

// Emits createWhere with input 1 as the condition, input 2 as the value
// taken where it holds and input 0 as the value taken elsewhere.
torch::jit::Node *maskedFillHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *i1 = node->input(1);
  auto *i2 = node->input(2);
  auto *i0 = node->input(0);
  // where(i1, i2, i0)
  return createWhere(graph, {i1, i2, i0});
}

// Builds (x - y)^2 from inputs 0 and 1, then wraps it in createIdentityloss
// using the reduction read from input 2.
torch::jit::Node *mseLossHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *t0 = createSub(graph, {x, y})->output();
  auto *t1 = createMul(graph, {t0, t0})->output();
  auto *red = node->input(2);
  auto t2 = constantToLong(red->node());
  auto t3 = convertReduceToPopart(t2);
  // identityloss(mul(sub(x, y), sub(x, y)), reduction(clong(red)))
  return createIdentityloss(graph, {t1}, t3);
}

// Builds x != y as NOT (x == y) from inputs 0 and 1.
torch::jit::Node *neHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto *t0 = createEqual(graph, {x, y})->output();
  // logical_not(equal(x, y))
  return createLogical_not(graph, {t0});
}

// Emits createNeg on input 0.
torch::jit::Node *negHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // neg(i0)
  return createNeg(graph, {i0});
}

// Emits createRandomNormal with the shape of input 0 and the float constants
// from inputs 1 and 2.
torch::jit::Node *normalInPlaceHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = shapeFromTensor(x);
  auto *c1 = node->input(1);
  auto t1 = constantToFloat(c1->node());
  auto *c2 = node->input(2);
  auto t2 = constantToFloat(c2->node());
  // randomNormal(x, tensor_shape(x), cfloat(c1), cfloat(c2))
  return createRandomNormal(graph, {x}, t0, t1, t2);
}

// Emits createDepthtospace on input 0 with the integer constant from input 1
// and the mode string "CRD".
torch::jit::Node *pixelShuffleHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto t0 = constantToLong(y->node());
  // depthtospace(x, clong(y), "CRD")
  return createDepthtospace(graph, {x}, t0, "CRD");
}

// Emits createPow on inputs 0 and 1.
torch::jit::Node *powHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // pow(i0, i1)
  return createPow(graph, {i0, i1});
}

// Emits createRandomUniform with the shape and scalar type taken from the
// node's own output 0, and the bounds 1.0 and 0.0.
torch::jit::Node *randHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *t0 = node->output(0);
  auto t2 = shapeFromTensor(t0);
  auto t3 = getNodeScalarType(t0);
  // randomUniform(x, tensor_shape(output0), 1.0, 0.0, scalar_type(output0))
  return createRandomUniform(graph, x, t2, 1.0, 0.0, t3);
}

// Emits createRandomNormal with no tensor inputs, the shape and scalar type
// taken from the node's own output 0, and the parameters 0.0 and 1.0.
torch::jit::Node *randnHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *t0 = node->output(0);
  auto t2 = shapeFromTensor(t0);
  auto t3 = getNodeScalarType(t0);
  // randomNormal({}, tensor_shape(output0), 0.0, 1.0, scalar_type(output0))
  return createRandomNormal(graph, {}, t2, 0.0, 1.0, t3);
}

// Emits createReciprocal on input 0; an int input is cast to float first.
torch::jit::Node *reciprocalHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // reciprocal(i0)
  if (getNodeScalarType(i0) == c10::kInt) {
    i0 = createCast(graph, i0, c10::kFloat)->output();
  }
  return createReciprocal(graph, {i0});
}

// Emits createReflectionPad on input 0 with the padding list from input 1.
torch::jit::Node *reflectionPad1dHandler(torch::jit::Graph *graph,
                                         torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto t0 = constantToLongVec(y->node());
  // reflectionPad(x, clong_list(y))
  return createReflectionPad(graph, x, t0);
}

// Emits createRelu on input 0.
torch::jit::Node *reluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // relu(i0)
  return createRelu(graph, {i0});
}

// Emits createRemainder on inputs 0 and 1.
torch::jit::Node *remainderHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // remainder(i0, i1)
  return createRemainder(graph, {i0, i1});
}

// Emits createEdgePad on input 0 with the padding list from input 1.
torch::jit::Node *replicationPad1dHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *y = node->input(1);
  auto t0 = constantToLongVec(y->node());
  // edgePad(x, clong_list(y))
  return createEdgePad(graph, x, t0);
}

// Emits createNearbyInt on input 0.
torch::jit::Node *roundHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // round(i0)
  return createNearbyInt(graph, {i0});
}
// aten::rsqrt: emitted as reciprocal(sqrt(x)).
torch::jit::Node *rsqrtHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *root = createSqrt(graph, {input})->output();
  return createReciprocal(graph, {root});
}

// aten::rsub: reversed subtraction, emitted as sub(input1, input0).
// NOTE(review): only inputs 0 and 1 are read here; if the op carries a
// scaling factor as a further input it is not applied - confirm callers.
torch::jit::Node *rsubHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *minuend = node->input(1);
  torch::jit::Value *subtrahend = node->input(0);
  return createSub(graph, {minuend, subtrahend});
}

// aten::selu: forwards the input to selu with the two fixed constants below.
torch::jit::Node *seluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  return createSelu(graph, {input}, 1.6732632423543772, 1.0507009873554805);
}

// aten::sigmoid: one-to-one mapping onto sigmoid.
torch::jit::Node *sigmoidHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  return createSigmoid(graph, {node->input(0)});
}

// aten::sign: one-to-one mapping onto sign.
torch::jit::Node *signHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  return createSign(graph, {node->input(0)});
}

// aten::silu: emitted as swish.
torch::jit::Node *siluHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  return createSwish(graph, {node->input(0)});
}

// aten::sin: one-to-one mapping onto sin.
torch::jit::Node *sinHandler(torch::jit::Graph *graph,
                             torch::jit::Node *node) {
  return createSin(graph, {node->input(0)});
}

// aten::sinh: one-to-one mapping onto sinh.
torch::jit::Node *sinhHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  return createSinh(graph, {node->input(0)});
}

// aten::smooth_l1_loss, with d = abs(x - y):
//
//   loss = where(beta > d, 0.5 * d * d / beta, d - 0.5 * beta)
//
// wrapped in an identityloss carrying the reduction read from input 2.
torch::jit::Node *smoothL1LossHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  torch::jit::Value *beta = node->input(3);
  torch::jit::Value *prediction = node->input(0);
  torch::jit::Value *target = node->input(1);

  torch::jit::Value *diff = createSub(graph, {prediction, target})->output();
  torch::jit::Value *abs_diff = createAbs(graph, {diff})->output();
  torch::jit::Value *below_beta =
      createGreater(graph, {beta, abs_diff})->output();
  torch::jit::Value *half =
      createConstantFloatLike(graph, abs_diff, {0.5}, {})->output();

  // Quadratic branch: ((0.5 * d) * d) / beta
  torch::jit::Value *half_abs = createMul(graph, {half, abs_diff})->output();
  torch::jit::Value *half_sq =
      createMul(graph, {half_abs, abs_diff})->output();
  torch::jit::Value *quadratic = createDiv(graph, {half_sq, beta})->output();

  // Linear branch: d - 0.5 * beta
  torch::jit::Value *half_beta = createMul(graph, {half, beta})->output();
  torch::jit::Value *linear =
      createSub(graph, {abs_diff, half_beta})->output();

  torch::jit::Value *loss =
      createWhere(graph, {below_beta, quadratic, linear})->output();

  torch::jit::Value *reduction_in = node->input(2);
  const auto reduction_code = constantToLong(reduction_in->node());
  const auto reduction = convertReduceToPopart(reduction_code);
  return createIdentityloss(graph, {loss}, reduction);
}

// aten::soft_margin_loss:
//
//   identityloss(log1p(exp(mul(neg(y), x))), reduction)
//
// where x is input 0, y is input 1 and the reduction is read from input 2.
torch::jit::Node *softMarginLossHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Value *target = node->input(1);
  torch::jit::Value *neg_target = createNeg(graph, {target})->output();
  torch::jit::Value *prediction = node->input(0);
  torch::jit::Value *scaled =
      createMul(graph, {neg_target, prediction})->output();
  torch::jit::Value *exponent = createExp(graph, {scaled})->output();
  torch::jit::Value *loss = createLog1p(graph, {exponent})->output();

  torch::jit::Value *reduction_in = node->input(2);
  const auto reduction_code = constantToLong(reduction_in->node());
  const auto reduction = convertReduceToPopart(reduction_code);
  return createIdentityloss(graph, {loss}, reduction);
}

// aten::softshrink, with threshold l taken from input 1:
//
//   where(x < -l, x + l, where(x > l, x - l, 0.0))
torch::jit::Node *softshrinkHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *lambda = node->input(1);

  torch::jit::Value *neg_lambda = createNeg(graph, {lambda})->output();
  torch::jit::Value *below = createLess(graph, {input, neg_lambda})->output();
  torch::jit::Value *shifted_up = createAdd(graph, {input, lambda})->output();
  torch::jit::Value *above = createGreater(graph, {input, lambda})->output();
  torch::jit::Value *shifted_down =
      createSub(graph, {input, lambda})->output();
  torch::jit::Value *zero =
      createConstantFloatLike(graph, shifted_down, {0.0}, {})->output();
  torch::jit::Value *upper_or_zero =
      createWhere(graph, {above, shifted_down, zero})->output();

  return createWhere(graph, {below, shifted_up, upper_or_zero});
}

// aten::sqrt: one-to-one mapping onto sqrt.
torch::jit::Node *sqrtHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  return createSqrt(graph, {node->input(0)});
}

// aten::square: emitted as mul(x, x).
torch::jit::Node *squareHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  return createMul(graph, {input, input});
}
torch::jit::Node *subHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto *y = node->input(1); auto *a = node->input(2); auto *t0 = createMul(graph, {y, a})->output(); auto *t1 = hasUnityValue(a) ? y : t0; // sub(x, alpha(y, a, mul(y, a))) return createSub(graph, {x, t1}); } torch::jit::Node *tHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *i0 = node->input(0); // transpose(i0, {}) return createTranspose(graph, {i0}, {}); } torch::jit::Node *tanHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *i0 = node->input(0); // tan(i0) return createTan(graph, {i0}); } torch::jit::Node *tanhHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *i0 = node->input(0); // tanh(i0) return createTanh(graph, {i0}); } torch::jit::Node *thresholdHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto *threshold = node->input(1); auto *t0 = createGreater(graph, {x, threshold})->output(); auto *val = node->input(2); // where(greater(x, threshold), x, val) return createWhere(graph, {t0, x, val}); } torch::jit::Node *topkHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto *c = node->input(1); auto *t0 = c->node(); setNodeTensorAttrValue(t0, getNodeTensorAttrValue(t0).to(at::ScalarType::Long)); t0->output()->inferTypeFrom(getNodeTensorAttrValue(t0)); auto *t1 = t0->output(); auto *l = node->input(2); auto t2 = x->type()->expect(); auto t3 = handleDimensionParam(l, t2); const bool largest = constantToBool(node->input(3)->node()); const bool sorted = constantToBool(node->input(4)->node()); // topk(x, inplace_cast(c), dimension(l, TensorType(x))) return createTopk(graph, {x, t1}, t3, largest, sorted); } torch::jit::Node *sortHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *const input = node->input(0); auto *const dim = node->input(2); const int64_t axis = handleDimensionParam(dim, input->type()->expect()); const bool 
descending = constantToBool(node->input(3)->node()); const bool stable = constantToBool(node->input(1)->node()); return createSort(graph, {input}, axis, descending, stable); } torch::jit::Node *uniformInPlaceHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *x = node->input(0); auto t0 = shapeFromTensor(x); auto *b = node->input(2); auto t1 = constantToFloat(b->node()); auto *a = node->input(1); auto t2 = constantToFloat(a->node()); // randomUniform(x, tensor_shape(x), cfloat(b), cfloat(a)) return createRandomUniform(graph, x, t0, t1, t2); } torch::jit::Node *whereHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *i0 = node->input(0); auto *i1 = node->input(1); auto *i2 = node->input(2); // where(i0, i1, i2) return createWhere(graph, {i0, i1, i2}); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::abs, absHandler); registerHandler(c10::aten::acos, acosHandler); registerHandler(c10::aten::acosh, acoshHandler); registerHandler(c10::aten::addmm, addmmHandler); registerHandler(c10::aten::asin, asinHandler); registerHandler(c10::aten::asinh, asinhHandler); registerHandler(c10::aten::atan, atanHandler); registerHandler(c10::aten::atan2, atan2Handler); registerHandler(c10::aten::atanh, atanhHandler); registerHandler(c10::aten::cat, catHandler); registerHandler(c10::aten::ceil, ceilHandler); registerHandler(c10::aten::celu, celuHandler); registerHandler(c10::aten::constant_pad_nd, constantPadNdHandler); registerHandler(c10::aten::cos, cosHandler); registerHandler(c10::aten::cosh, coshHandler); registerHandler(c10::aten::detach, detachHandler); registerHandler(c10::aten::div, divHandler); registerHandler(c10::aten::elu, eluHandler); registerHandler(c10::aten::eq, eqHandler); registerHandler(c10::aten::erf, erfHandler); registerHandler(c10::aten::erfc, erfcHandler); registerHandler(c10::aten::exp, expHandler); registerHandler(c10::aten::expm1, expm1Handler); 
registerHandler(c10::aten::floor, floorHandler); registerHandler(c10::aten::fmod, fmodHandler); registerHandler(c10::aten::ge, geHandler); registerHandler(c10::aten::gt, gtHandler); registerHandler(c10::aten::hardshrink, hardshrinkHandler); registerHandler(c10::aten::hardtanh, hardtanhHandler); registerHandler(c10::aten::hinge_embedding_loss, hingeEmbeddingLossHandler); registerHandler(c10::aten::index_select, indexSelectHandler); registerHandler(c10::aten::isnan, isnanHandler); registerHandler(c10::aten::l1_loss, l1LossHandler); registerHandler(c10::aten::le, leHandler); registerHandler(c10::aten::leaky_relu, leakyReluHandler); registerHandler(c10::aten::log, logHandler); registerHandler(c10::aten::log10, log10Handler); registerHandler(c10::aten::log1p, log1pHandler); registerHandler(c10::aten::log2, log2Handler); registerHandler(c10::aten::log_sigmoid, logSigmoidHandler); registerHandler(c10::aten::log_sigmoid_forward, logSigmoidHandler); registerHandler(c10::aten::logical_and, logicalAndHandler); registerHandler(c10::aten::logical_not, logicalNotHandler); registerHandler(c10::aten::logical_or, logicalOrHandler); registerHandler(c10::aten::lt, ltHandler); registerHandler(c10::aten::margin_ranking_loss, marginRankingLossHandler); registerHandler(c10::aten::masked_fill, maskedFillHandler); registerHandler(c10::aten::mse_loss, mseLossHandler); registerHandler(c10::aten::ne, neHandler); registerHandler(c10::aten::neg, negHandler); registerHandler(c10::aten::normal_, normalInPlaceHandler); registerHandler(c10::aten::pixel_shuffle, pixelShuffleHandler); registerHandler(c10::aten::pow, powHandler); registerHandler(c10::aten::rand, randHandler); registerHandler(c10::aten::randn, randnHandler); registerHandler(c10::aten::reciprocal, reciprocalHandler); registerHandler(c10::aten::reflection_pad1d, reflectionPad1dHandler); registerHandler(c10::aten::reflection_pad2d, reflectionPad1dHandler); registerHandler(c10::aten::relu, reluHandler); 
registerHandler(c10::aten::remainder, remainderHandler); registerHandler(c10::aten::replication_pad1d, replicationPad1dHandler); registerHandler(c10::aten::replication_pad2d, replicationPad1dHandler); registerHandler(c10::aten::replication_pad3d, replicationPad1dHandler); registerHandler(c10::aten::round, roundHandler); registerHandler(c10::aten::rsqrt, rsqrtHandler); registerHandler(c10::aten::rsub, rsubHandler); registerHandler(c10::aten::selu, seluHandler); registerHandler(c10::aten::sigmoid, sigmoidHandler); registerHandler(c10::aten::sign, signHandler); registerHandler(c10::aten::silu, siluHandler); registerHandler(c10::aten::sin, sinHandler); registerHandler(c10::aten::sinh, sinhHandler); registerHandler(c10::aten::smooth_l1_loss, smoothL1LossHandler); registerHandler(c10::aten::soft_margin_loss, softMarginLossHandler); registerHandler(c10::aten::softshrink, softshrinkHandler); registerHandler(c10::aten::sort, sortHandler); registerHandler(c10::aten::sqrt, sqrtHandler); registerHandler(c10::aten::square, squareHandler); registerHandler(c10::aten::sub, subHandler); registerHandler(c10::aten::t, tHandler); registerHandler(c10::aten::tan, tanHandler); registerHandler(c10::aten::tanh, tanhHandler); registerHandler(c10::aten::threshold, thresholdHandler); registerHandler(c10::aten::topk, topkHandler); registerHandler(c10::aten::uniform_, uniformInPlaceHandler); registerHandler(c10::aten::where, whereHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/BilinearOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include #include #include #include namespace poptorch { namespace { torch::jit::Node *bilinearHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? 
bias) // -> Tensor // Bilinear - outputs a linear combination of feature inputs: // // Ynm = \sum_ij Un_i Am_ij Vn_j + bm // // Where U and V are the data input tensors containing feature vectors // (possibly ND), A is the 3D weight tensor, and b is the bias vector. // We can evaluate the bilinear map in pytorch as follows: // // U = U.unsqueeze(-2).unsqueeze(-2) // V = V.unsqueeze(-2).unsqueeze(-1) // Y = U.matmul(A).matmul(V) // Y = Y.squeeze(-1).squeeze(-1) // Y = Y + b // Tensor feature inputs torch::jit::Value *in1 = node->input(0); torch::jit::Value *in2 = node->input(1); // weight and the optional bias torch::jit::Value *weight = node->input(2); torch::jit::Value *bias = node->input(3); // Insert singleton dimensions in feature inputs auto shape1 = shapeFromTensor(in1); shape1.insert(shape1.end() - 1, 1); shape1.insert(shape1.end() - 1, 1); torch::jit::Node *flat_in1 = createReshape(graph, in1, shape1); auto shape2 = shapeFromTensor(in2); shape2.insert(shape2.end() - 1, 1); shape2.insert(shape2.end(), 1); torch::jit::Node *flat_in2 = createReshape(graph, in2, shape2); // Multiply matrices together for the bilinear map: U * A * V as above torch::jit::Node *in1_matmul_weight = poptorch::createMatmul(graph, {flat_in1->output(), weight}); torch::jit::Node *bilinear_map = poptorch::createMatmul( graph, {in1_matmul_weight->output(), flat_in2->output()}); // Squeeze out the trailing singleton dims by reshaping to the expected // result size. Taking care to omit the singleton dims injected above, we // derive the output shape from the leading dimensions of input1 and the // size in the first dimension of the weight tensor. In pytorch: // // U.shape[0:-1] + (A.shape[0],) // // is the expected output size. 
std::vector result_shape(shape1.begin(), shape1.end() - 3); auto weight_shape = shapeFromTensor(weight); result_shape.push_back(weight_shape.front()); torch::jit::Node *result = createReshape(graph, bilinear_map->output(), result_shape); // Add optional bias if (!isNone(bias->node())) { result = poptorch::createAdd(graph, {result->output(), bias}); } return result; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::bilinear, bilinearHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/BitwiseOps.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace { // PyTorch's bitwise_* functions can take any integral tensors as input (ie. // torch.{uint8,int8,int16,int32,int64}. However, Poplibs' element-wise binary // ops don't support 8-bit int inputs (see // popops/codelets/elementwiseBinaryCodelets.cpp). Use this extra function to // generate slightly nicer error messages. 
void verifyCompatibleIntegralInputs(torch::jit::Node *node, const std::string &op_name) { ERROR_ON_MSG(allInputsOfType(node, at::ScalarType::Byte) || allInputsOfType(node, at::ScalarType::Char), op_name + ": Poplar does not support binary operations on " "8-bit integral types."); } torch::jit::Node *bitwiseAndHandler(torch::jit::Graph *graph, torch::jit::Node *node) { if (allInputsBool(node)) { return createLogical_and(graph, {node->input(0), node->input(1)}); } if (allInputsInteger(node)) { verifyCompatibleIntegralInputs(node, "Bitwise-and"); return createBitwiseand(graph, {node->input(0), node->input(1)}); } ERROR("Bitwise-and operator supports only bool and integer types"); return nullptr; } torch::jit::Node *bitwiseNotHandler(torch::jit::Graph *graph, torch::jit::Node *node) { if (allInputsBool(node)) { return createLogical_not(graph, {node->input(0)}); } if (allInputsInteger(node)) { verifyCompatibleIntegralInputs(node, "Bitwise-not"); return createBitwisenot(graph, {node->input(0)}); } ERROR("Bitwise-not operator supports only bool and integer types"); return nullptr; } torch::jit::Node *bitwiseOrHandler(torch::jit::Graph *graph, torch::jit::Node *node) { if (allInputsBool(node)) { return createLogical_or(graph, {node->input(0), node->input(1)}); } if (allInputsInteger(node)) { verifyCompatibleIntegralInputs(node, "Bitwise-or"); return createBitwiseor(graph, {node->input(0), node->input(1)}); } ERROR("Bitwise-or operator supports only bool and integer types"); return nullptr; } torch::jit::Node *bitwiseXorHandler(torch::jit::Graph *graph, torch::jit::Node *node) { if (allInputsBool(node)) { return createLogical_xor(graph, {node->input(0), node->input(1)}); } if (allInputsInteger(node)) { verifyCompatibleIntegralInputs(node, "Bitwise-xor"); return createBitwisexor(graph, {node->input(0), node->input(1)}); } ERROR("Bitwise-xor operator supports only bool and integer types"); return nullptr; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) 
static void registration() { registerHandler(c10::aten::bitwise_and, bitwiseAndHandler); registerHandler(c10::aten::bitwise_not, bitwiseNotHandler); registerHandler(c10::aten::bitwise_or, bitwiseOrHandler); registerHandler(c10::aten::bitwise_xor, bitwiseXorHandler); registerHandler(c10::aten::__and__, bitwiseAndHandler); registerHandler(c10::aten::__or__, bitwiseOrHandler); registerHandler(c10::aten::__xor__, bitwiseXorHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/BlasOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "../PoptorchSymbols.hpp" namespace poptorch { namespace { torch::jit::Node *matmulHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::mm(Tensor self, Tensor mat2) -> (Tensor) // "aten::matmul(Tensor self, int dim) -> Tensor" // We will fuse the batch dimesion of the matrix A of matmul(A, B), // if we find such a pattern: // // matrix A(N, M, K) multiplies matrix B(K, L) // // where matrix A is matmul's input 0, and matrix B is its input 1. // The matrix A will be reshaped into A(N*M, K) before matmul, // The benefit of this transformation is to avoid the ReduceSum // of the backwrad pass, as ReduceSum is a performance bottleneck otherwise. // // The input IR before canonicalization: // %output : Float(3:14, 2:7, 7:1) = aten::matmul(%input.1, %27) // It takes 3 steps for the transformation: // 1. Reshape // 2. Matmul // 3. 
Reshape // The output IRs after canonicalization: // %28 : Float(6:7, 7:1) = // popart::reshape_static_shape[shape=[6, 7]](%input.1) // %29 : FloatTensor = popart::matmul(%28, %27) // %30 : Float(3:14, 2:7, 7:1) = // popart::reshape_static_shape[shape=[3, 2, 7]](%29) torch::jit::Value *matrix_a = node->input(0); torch::jit::Value *matrix_b = node->input(1); std::vector shape_input_a = shapeFromTensor(matrix_a); std::vector shape_input_b = shapeFromTensor(matrix_b); std::int64_t const size_a = shape_input_a.size(); std::int64_t const size_b = shape_input_b.size(); torch::jit::Node *result; // Matrix A can have any batch dimensions // But matrix B has to be in a 2D shape if (size_a >= 3 && size_b == 2 && shape_input_a[size_a - 1] == shape_input_b[0]) { // Prepare the output shape of matmul by // - merging all the batch dimensions of matrix A, and // - taking the last dimension of matrix B std::vector output_shape; // Prepare the shape of fused batch dimensions for matrix A std::vector fused_a_shape; std::int64_t merged_dim = shape_input_a[size_a - 2]; for (std::int64_t i = 0; i < size_a - 2; ++i) { // Final output shape could have any batch dimensions as before output_shape.push_back(shape_input_a[i]); merged_dim *= shape_input_a[i]; } output_shape.push_back(shape_input_a[size_a - 2]); output_shape.push_back(shape_input_b[size_b - 1]); // Matrix A has 2D shape after fusing batch dimensions fused_a_shape.push_back(merged_dim); fused_a_shape.push_back(shape_input_a[size_a - 1]); // 1. Reshape matrix A to merge all of its batch size dimensions torch::jit::Node *merge_mat = createReshape(graph, matrix_a, fused_a_shape); // 2. Matmul torch::jit::Node *mul = createMatmul(graph, {merge_mat->output(), matrix_b}); // 3. 
Reshape to the expected shape of the original matmul result = createReshape(graph, mul->output(), output_shape); // Add the trace to ease debugging for before and after IRs logging::trace("Replacing matmul {} with {} {} {}", *node, *merge_mat, *mul, *result); } else { // The "normal" matmul will follow the original path result = createMatmul(graph, {matrix_a, matrix_b}); } return result; } torch::jit::Node *baddbmmHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *batch1 = node->input(1); auto *batch2 = node->input(2); auto b2_dtype = *batch2->type()->expect()->scalarType(); auto *t0 = createMatmul(graph, {batch1, batch2})->output(); auto *alpha = node->input(4); auto *t1 = createMul(graph, {t0, alpha})->output(); auto *input = node->input(0); auto input_dtype = *input->type()->expect()->scalarType(); auto *beta = node->input(3); // PyTorch type inference dictates that the output scalar type is that of // the second batch input, so cast the first input if necessary auto *t2 = createMul(graph, {input, beta})->output(); if (b2_dtype != input_dtype) { t2 = createCast(graph, t2, b2_dtype)->output(); } // add(mul(matmul(batch1, batch2), alpha), mul(input, beta)) return createAdd(graph, {t1, t2}); } torch::jit::Node *addmvHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *input = node->input(0); auto *mat = node->input(1); auto *vec = node->input(2); auto *beta = node->input(3); auto *alpha = node->input(4); const auto alpha_val = constantToFloat(alpha->node()); const auto beta_val = constantToFloat(beta->node()); if (alpha_val == 0 && beta_val == 0) { return createConstantFloatLike(graph, input, {0}, {shapeFromTensor(input)}); } torch::jit::Node *t1 = nullptr; if (alpha_val != 0) { auto *t0 = createMatmul(graph, {mat, vec})->output(); t1 = createMul(graph, {t0, alpha}); } torch::jit::Node *output; if (beta_val != 0) { auto *t2 = createMul(graph, {input, beta}); if (t1 != nullptr) { output = createAdd(graph, {t1->output(), t2->output()}); } 
else { output = t2; } } else { output = t1; } return output; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::matmul, matmulHandler); registerHandler(c10::aten::baddbmm, baddbmmHandler); registerHandler(c10::aten::addmv, addmvHandler); // Matrix-Vector registerHandler(c10::aten::mv, matmulHandler); // Vector-Vector registerHandler(c10::aten::dot, matmulHandler); // With bias. registerHandler(c10::aten::bmm, matmulHandler); // No bias. registerHandler(c10::aten::mm, matmulHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/ConstantOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "../PoptorchSymbols.hpp" namespace poptorch { namespace { torch::jit::Node *onesZerosHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::ones(int[] size, *, int? dtype, int? layout, Device? device, bool? // pin_memory) -> Tensor // aten::zeros(int[] size, *, int? dtype, int? layout, Device? device, bool? // pin_memory) -> Tensor // aten::zeros_like(Tensor self, ScalarType? dtype, Layout? layout, Device? // device, bool? pin_memory, MemoryFormat? memory_format) // -> Tensor // aten::ones_like(Tensor self, ScalarType? dtype, Layout? layout, Device? // device, bool? pin_memory, MemoryFormat? memory_format) // -> Tensor torch::jit::Symbol kind = node->kind(); const bool is_ones = kind == c10::aten::ones || kind == c10::aten::ones_like || kind == c10::aten::new_ones; auto *output = node->output(); auto *new_node = createAndInsertNode( graph, is_ones ? 
symbols::poptorch::ones : symbols::poptorch::zeros, {}, ImplicitCast::None, OutputType::AsDtype, 1, getNodeScalarType(output)); if (kind != c10::aten::new_ones && kind != c10::aten::new_zeros) { new_node->is_(c10::attr::shape, shapeFromTensor(output)); } else { const auto shape_list = handleTensorList(node->input(1)->node()); std::vector shape; for (auto *size : shape_list) { ERROR_ON_MSG( !isTensorConstant(size->node()), "Invalid shape for " "new_zeros or new_ones. Shape needs to be a static constant"); shape.emplace_back(constantToInt(size->node())); } new_node->is_(c10::attr::shape, shape); } return new_node; } torch::jit::Node *arangeHandler(torch::jit::Graph *graph, torch::jit::Node *node) { std::size_t start; std::size_t end; std::size_t step; switch (node->inputs().size()) { // arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) // arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, // Device? device=None, bool? pin_memory=None) -> Tensor case 2: case 5: start = 0; end = constantToLong(node->input(0)->node()); step = 1; break; // arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, // Layout? layout=None, Device? device=None, // bool? pin_memory=None) -> Tensor case 6: start = constantToLong(node->input(0)->node()); end = constantToLong(node->input(1)->node()); step = 1; break; // arange.start_out(Scalar start, Scalar end, Scalar step=1, *, // Tensor(a!) out) -> Tensor(a!) // arange.start_step(Scalar start, Scalar end, Scalar step, *, // ScalarType? dtype=None, Layout? layout=None, // Device? device=None, bool? 
pin_memory=None) -> Tensor case 4: case 7: start = constantToLong(node->input(0)->node()); end = constantToLong(node->input(1)->node()); step = constantToLong(node->input(2)->node()); break; default: ERROR("Unsupported arange op"); break; } std::vector vals((end - start) / step); size_t v = start; std::generate(std::begin(vals), std::end(vals), [&v, step] { auto cv = v; v += step; return cv; }); return createConstantInt(graph, vals, {static_cast(vals.size())}); } torch::jit::Node *randpermHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::randperm(Scalar n, ScalarType dtype, Layout, Device, bool pin_memory) auto *n = node->input(0)->node(); setNodeTensorAttrValue(n, getNodeTensorAttrValue(n).to(at::ScalarType::Long)); n->output()->inferTypeFrom(getNodeTensorAttrValue(n)); auto *size_of_permutation = n->output(); const std::vector shape = {constantToLong(n)}; const auto dtype = c10::ScalarType::Float; torch::jit::Value *uniform = createRandomUniform(graph, nullptr, shape, 1.0, 0.0, dtype)->output(); auto *topk = createTopk(graph, {uniform, size_of_permutation}, 0, true /*largest*/, true /*sorted*/); return createCast(graph, topk->output(1), c10::ScalarType::Int); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::arange, arangeHandler); registerHandler(c10::aten::ones, onesZerosHandler); registerHandler(c10::aten::ones_like, onesZerosHandler); registerHandler(c10::aten::new_ones, onesZerosHandler); registerHandler(c10::aten::new_zeros, onesZerosHandler); registerHandler(c10::aten::zeros, onesZerosHandler); registerHandler(c10::aten::zeros_like, onesZerosHandler); registerHandler(c10::aten::randperm, randpermHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/ConvolutionOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Canonicalises the generic convolution ops into either a conv or a
// transposed conv, depending on the constant `transposed` input.
torch::jit::Node *convolutionHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  // aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[]
  //                    stride, int[] padding, int[] dilation, bool transposed,
  //                    int[] output_padding, int groups) -> Tensor
  const bool transposed = constantToBool(node->input(6)->node());

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *kernel = node->input(1);
  torch::jit::Value *bias = node->input(2);
  castWeightAndBias(graph, input, kernel, bias);

  // The bias is only passed on when it is not None.
  std::vector inputs{input, kernel};
  if (!isNone(bias->node())) {
    inputs.push_back(bias);
  }

  const std::vector stride = constantToLongVec(node->input(3)->node());
  std::vector padding = constantToLongVec(node->input(4)->node());

  // PyTorch gives the padding as being the amount to pad in both
  // directions. PopART takes two arguments for each axis, the amount to pad
  // in each direction along that axis. In the form (Axis0Left, AxisNLeft...,
  // Axis0Right, AxisNRight) where left and right refer to the direction
  // along the axis to add zeros to.
  // The loop below therefore appends a copy of the per-axis padding.
  const std::size_t num_pads = padding.size();
  for (std::size_t pad_index = 0; pad_index < num_pads; ++pad_index) {
    padding.push_back(padding[pad_index]);
  }

  const std::vector dilation = constantToLongVec(node->input(5)->node());
  const std::vector output_padding = constantToLongVec(node->input(7)->node());
  std::int64_t const groups = constantToLong(node->input(8)->node());

  if (!transposed) {
    // Create a "normal" convolution.
    // output_padding should be zero except for conv transpose
    for (auto out_pad : output_padding) {
      ERROR_ON(out_pad > 0);
    }
    return createConv(graph, inputs, dilation, groups, {}, padding, stride);
  }

  return createConvtranspose(graph, inputs, dilation, groups, {},
                             output_padding, {}, padding, stride);
}

// Canonicalises aten::conv2d and aten::mkldnn_convolution into a conv.
// The two ops carry stride and padding at swapped input positions.
torch::jit::Node *conv2dHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::conv2d(Tensor input, Tensor weight, Tensor? bias, int[] stride,
  //              int[] padding, int[] dilation, int groups) -> Tensor
  // Or:
  // aten::mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[]
  //              padding, int[] stride, int[] dilation, int groups) -> (Tensor)
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *kernel = node->input(1);
  torch::jit::Value *bias = node->input(2);
  castWeightAndBias(graph, input, kernel, bias);

  // The bias is only passed on when it is not None.
  std::vector inputs{input, kernel};
  if (!isNone(bias->node())) {
    inputs.push_back(bias);
  }

  // Pick the stride/padding input positions matching the op's signature.
  const bool is_mkldnn_conv = node->kind() == c10::aten::mkldnn_convolution;
  const std::uint32_t stride_index = is_mkldnn_conv ? 4 : 3;
  const std::uint32_t padding_index = is_mkldnn_conv ? 3 : 4;

  const std::vector stride = constantToLongVec(node->input(stride_index)->node());
  std::vector padding = constantToLongVec(node->input(padding_index)->node());

  // PyTorch gives the padding as being the amount to pad in both
  // directions. PopART takes two arguments for each axis, the amount to pad
  // in each direction along that axis. In the form (Axis0Left, AxisNLeft...,
  // Axis0Right, AxisNRight) where left and right refer to the direction
  // along the axis to add zeros to.
  const std::size_t num_pads = padding.size();
  for (std::size_t pad_index = 0; pad_index < num_pads; ++pad_index) {
    padding.push_back(padding[pad_index]);
  }

  const std::vector dilation = constantToLongVec(node->input(5)->node());
  std::int64_t const groups = constantToLong(node->input(6)->node());

  return poptorch::createConv(graph, inputs, dilation, groups, {}, padding,
                              stride);
}

// Canonicalises aten::cumsum as a convolution of the input with a kernel of
// ones, after moving the summed dimension to the front.
torch::jit::Node *cumsumHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *data = node->input(0);
  std::vector data_shape = shapeFromTensor(data);
  int64_t dim = constantToLong(node->input(1)->node());
  const int64_t r = static_cast(data_shape.size());

  // NOTE(review): when r == 0 this condition is true for every dim, so a
  // rank-0 input never reaches the `span < 2` early return below - confirm
  // whether that is intended.
  ERROR_ON_MSG(dim < -r || dim > r - 1, "Dimension out of range.");
  // Normalise a negative dim to its non-negative equivalent.
  if (dim < 0) {
    dim += r;
  }

  // By default, the output's `dtype` should match the input's.
  at::ScalarType requested_output_dtype = getNodeScalarType(data);
  if (node->inputs().size() == 4) {
    // We've been called with the form `torch.cumsum(..., out=output)`, so the
    // output tensor's `dtype` gets used as per the `torch.cumsum` spec.
    requested_output_dtype = getNodeScalarType(node->input(3));
  } else if (!isNone(node->input(2))) {
    // We've been called with an explicit `dtype`, so use that.
    requested_output_dtype = constantToScalarType(node->input(2)->node());
  }

  // We have to cast the input tensor to the output `dtype` *before* doing the
  // sum, to conform with the API of `torch.cumsum`.
  data = createCast(graph, data, requested_output_dtype)->output();

  // The 1-D conv kernel span is the size in the dim we are reducing along
  const int64_t span = data_shape[static_cast(dim)];
  if (span < 2) {
    // cumsum in singleton dimension or scalar/empty
    return createIdentity(graph, {data});
  }

  // Create the 1-d conv kernel
  const std::vector kernel_data(static_cast(span), 1.0);
  torch::jit::Value *ones =
      createConstantFloatLike(graph, data, kernel_data, {span})->output();

  // ONNX conv expects the kernel to have size M x C/group x kH x kW
  // So reshape the kernel to have size [1,1,span,1]
  std::vector kernel_shape(4, 1);
  kernel_shape[2] = span;
  torch::jit::Value *k = createReshape(graph, ones, kernel_shape)->output();

  if (dim != 0) {
    // Transpose input so that we can apply the 1-d conv assuming dim==0
    std::vector p(r);
    std::iota(p.begin(), p.end(), 0);
    std::swap(p[0], p[dim]);
    data = createTranspose(graph, {data}, p)->output();
    // Keep data_shape in step with the transposed tensor; it is used for the
    // final reshape below.
    std::swap(data_shape[0], data_shape[dim]);
  }

  // Coerce into [N,M] 2-d tensor
  if (r < 2) {
    data = createUnsqueeze(graph, {data}, {1})->output();
  }
  if (r > 2) {
    data = createFlatten(graph, {data}, 1)->output();
  }

  // ONNX conv expects the input data to have size batch x channel x H x W
  // So we reshape the [N,M] 2-d data to [M,1,N,1] and apply the 1-d conv
  // kernel of ones with [span-1,0] padding above and below.
  torch::jit::Value *x = createUnsqueeze(graph, {data}, {2, 3})->output();
  x = createTranspose(graph, {x}, {1, 2, 0, 3})->output();
  x = createCast(graph, x, c10::ScalarType::Float)->output();
  torch::jit::Value *y =
      createConv(graph, {x, k}, {}, 1, {}, {span - 1, 0, 0, 0}, {})->output();

  // Unfortunately we have to cast again here, because `createConv` always
  // returns a float-typed tensor. We can't *only* cast here either, because
  // cast -> sum != sum -> cast when going from float to int, and the spec of
  // `torch.cumsum` says to cast first. If we don't cast at all, info about our
  // size doesn't get transmitted to later ops relying on us (eg. `select`).
  y = createCast(graph, y, requested_output_dtype)->output();

  // Work back to the correct expected output shape
  y = createTranspose(graph, {y}, {2, 0, 1, 3})->output();
  y = createReshape(graph, y, data_shape)->output();

  if (dim != 0) {
    // Transpose back to the original axes orientation.
    std::vector p(r);
    std::iota(p.begin(), p.end(), 0);
    std::swap(p[0], p[dim]);
    y = createTranspose(graph, {y}, p)->output();
  }

  return y->node();
}

// Canonicalises aten::cumprod as an unrolled loop: for each index i >= 1
// along `dim`, slice i is replaced by the product of slices i-1 and i of the
// running result.
torch::jit::Node *cumprodHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  torch::jit::Value *data = node->input(0);
  const std::vector data_shape = shapeFromTensor(data);
  const int64_t dim = handleDimensionParam(
      node->input(1), data->type()->expect());
  // One multiply/update pair is emitted per element along `dim` (minus one).
  const int64_t num_iters = data_shape.at(dim);

  auto *result = createIdentity(graph, {data});
  // Re-attach the known sizes to the freshly created output.
  result->output()->setType(
      result->output()->type()->expect()->withSizes(
          data_shape));

  // NOTE(review): select_handler is not referenced again in this function.
  auto select_handler = getHandler(c10::aten::select);

  for (int64_t i = 1; i < num_iters; ++i) {
    const auto src_slice_idx = i - 1;
    const auto dst_slice_idx = i;

    auto *const src = createSlice(graph, {result->output()},
                                  {src_slice_idx + 1}, {src_slice_idx}, {dim})
                          ->output();
    auto *const dst = createSlice(graph, {result->output()},
                                  {dst_slice_idx + 1}, {dst_slice_idx}, {dim})
                          ->output();
    auto *const new_val = createMul(graph, {src, dst})->output();

    // Write the product back into slice `dst_slice_idx` of the result.
    const std::vector args{
        result->output(), wrapInConstantVec(graph, {dst_slice_idx}), new_val};
    result = createDynamicupdate(graph, args, {dim}, {1}, 0);
    result->output()->setType(
        result->output()->type()->expect()->withSizes(
            data_shape));
  }

  return result;
}

} // namespace

// Registers the handlers above; the function is marked as a constructor with
// priority HANDLER_INIT_PRIORITY.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::convolution, convolutionHandler);
  registerHandler(c10::aten::convolution_overrideable, convolutionHandler);
  registerHandler(c10::aten::_convolution, convolutionHandler);
  registerHandler(c10::aten::mkldnn_convolution, conv2dHandler);
  registerHandler(c10::aten::conv2d, conv2dHandler);
  registerHandler(c10::aten::cumsum, cumsumHandler);
  registerHandler(c10::aten::cumprod, cumprodHandler);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/CustomOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Canonicalises poptorch::custom_operation into a custom operation node whose
// outputs are wrapped in a prim::ListConstruct.
// Inputs read: 0 = tensor list, 1 = name, 2 = domain, 3 = domain version,
// 4 = number of outputs, 6 = attributes id string. Input 5 is not read here.
torch::jit::Node *customOpHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  std::vector inputs = handleTensorList(node->input(0)->node());
  std::string name = constantToString(node->input(1)->node());
  std::string domain = constantToString(node->input(2)->node());

  // Get the domain version.
  std::int64_t domain_version = constantToLong(node->input(3)->node());

  // Get the number of outputs.
  std::int64_t num_outputs = constantToLong(node->input(4)->node());

  // The attributes are in the Python dict represented by an id within a string
  auto attributes_id_str = constantToString(node->input(6)->node());

  // Add the custom op with a variadic number of outputs.
  torch::jit::Node *custom_op =
      createCustomOperation(graph, inputs, name, domain, domain_version,
                            num_outputs, attributes_id_str);

  // It is replacing an operation which returned a list so add a list
  // construct to keep the IR legal.
  return createAndInsertNode(graph, at::prim::ListConstruct,
                             custom_op->outputs());
}

} // namespace

// Registers the custom operation handler.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(symbols::poptorch::custom_operation, customOpHandler);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/DistanceOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Canonicalises aten::pairwise_distance as norm(x1 - x2 + eps, p, 1, keepdim)
// by delegating to the registered aten::norm handler.
torch::jit::Node *pairwiseDistanceHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  // aten::pairwise_distance(Tensor x1, Tensor x2, float p, float eps,
  //                         bool keepdim)

  // Input 1
  auto *x1 = node->input(0);
  // Input 2
  auto *const x2 = node->input(1);
  // Norm degree
  auto *const p = node->input(2);
  // Small value to avoid division by zero
  auto *const eps = node->input(3);
  // Whether to keep vector dimension
  auto *const keepdim = node->input(4);

  auto input_shape = shapeFromTensor(x1);
  bool reshape_output = false;

  // No batch dim, append one to front
  // (D) -> (N, D), N = 1
  if (input_shape.size() == 1) {
    input_shape = {1, input_shape[0]};
    x1 = createUnsqueeze(graph, {x1}, {0})->output();
    reshape_output = true;
  }

  // x1 - x2
  auto *const x1_minus_x2 = createSub(graph, {x1, x2})->output();

  // x1 - x2 + eps
  auto *const x1_minus_x2_plus_eps =
      createAdd(graph, {x1_minus_x2, eps})->output();
  // Attach the (possibly batched) shape of x1 to the intermediate value.
  x1_minus_x2_plus_eps->setType(
      x1_minus_x2_plus_eps->type()->expect()->withSizes(
          input_shape));

  // 1
  auto *const ones = wrapInConstant1D(graph, 1);
  // tensorNormHandler expects ListConstruct for dims
  torch::jit::Node *const ones_list =
      createAndInsertNode(graph, c10::prim::ListConstruct, {ones});

  std::vector norm_inputs = {x1_minus_x2_plus_eps, p, ones_list->output(),
                             keepdim};

  // norm(x1 - x2 + eps, p, 1, keepdim)
  auto *out = createHandlerOperation(graph, getHandler(c10::aten::norm),
                                     norm_inputs);

  // If passed inputs of size (1, N), the output of norm will have shape
  // torch.Size([1]), but torch outputs torch.Size([]), so reshape
  if (reshape_output) {
    out = createReshape(graph, out->output(),
                        shapeFromTensor(node->output(0)));
  }

  return out;
}

// Canonicalises aten::cosine_similarity as
//   sum(x1 * x2, dim) / max(sqrt(sum(x1^2, dim) * sum(x2^2, dim)), eps)
// with the reduced dimension dropped (keepdims argument is 0).
torch::jit::Node *cosineSimilarityHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  // aten::cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim,
  //                         double eps)

  // inputs
  auto *const x1 = node->input(0);
  auto *const x2 = node->input(1);
  const auto dim = constantToLong(node->input(2)->node());
  auto *const eps = node->input(3);

  // dividend
  auto *const mul12 = createMul(graph, {x1, x2})->output();
  auto *const dot12 = createReducesum(graph, {mul12}, {dim}, 0)->output();

  // divisor
  auto *const mag1_sq = createReducesumsquare(graph, {x1}, {dim}, 0)->output();
  auto *const mag2_sq = createReducesumsquare(graph, {x2}, {dim}, 0)->output();
  auto *const mag12_sq = createMul(graph, {mag1_sq, mag2_sq})->output();
  auto *const mag12 = createSqrt(graph, {mag12_sq})->output();
  // Clamp the divisor from below with eps.
  auto *const mag12_nonzero = createMax(graph, {mag12, eps})->output();

  return createDiv(graph, {dot12, mag12_nonzero});
}

// Canonicalises aten::cdist / aten::_cdist_forward. x1 is reshaped to
// (batch..., P, 1, M) and x2 to (batch..., 1, R, M) so that their difference
// broadcasts to every pair of rows; the norm handler then reduces the last
// dimension.
torch::jit::Node *cdistHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // Input 1
  auto *const x1 = node->input(0);
  // Input 2
  auto *const x2 = node->input(1);
  // Norm degree
  auto *const p_degree = node->input(2);

  const std::vector x1_shape = shapeFromTensor(x1);
  const std::vector x2_shape = shapeFromTensor(x2);
  const auto ndim_x1 = x1_shape.size();
  const auto ndim_x2 = x2_shape.size();

  std::vector x1_shape_expanded;
  std::vector x2_shape_expanded;

  // Last dimension (M) is kept as is for both inputs.
  if (ndim_x1 > 0) {
    const auto m = x1_shape.at(ndim_x1 - 1);
    x1_shape_expanded.push_back(m);
  }
  if (ndim_x2 > 0) {
    const auto m = x2_shape.at(ndim_x2 - 1);
    x2_shape_expanded.push_back(m);
  }

  // Second-to-last dimension: x1 becomes (P, 1, M), x2 becomes (1, R, M).
  if (ndim_x1 > 1) {
    const auto p = x1_shape.at(ndim_x1 - 2);
    x1_shape_expanded.insert(x1_shape_expanded.begin(), {p, 1});
  }
  if (ndim_x2 > 1) {
    const auto r = x2_shape.at(ndim_x2 - 2);
    x2_shape_expanded.insert(x2_shape_expanded.begin(), {1, r});
  }

  // Leading (batch) dimensions of each input, if any.
  std::vector b_x1;
  std::vector b_x2;
  if (ndim_x1 > 2) {
    b_x1 = {x1_shape.begin(), x1_shape.end() - 2};
  }
  if (ndim_x2 > 2) {
    b_x2 = {x2_shape.begin(), x2_shape.end() - 2};
  }

  if (b_x1 != b_x2) {
    // Builds a batch shape with the same rank as `inferred_size`, walking
    // `inferred_size` from its last entry to its first: entries of
    // `batch_shape` are consumed (from the back) when they match the inferred
    // value or are 1, and a 1 is inserted otherwise.
    const auto get_broadcasted_batch_shape =
        [](const std::vector &batch_shape,
           const std::vector &inferred_size) {
          if (batch_shape == inferred_size) {
            return batch_shape;
          }

          std::vector broadcasted_shape;
          const auto batch_shape_size = batch_shape.size();
          std::for_each(
              inferred_size.crbegin(), inferred_size.crend(),
              [cnt = 0u, batch_shape_size, &broadcasted_shape,
               &batch_shape](const auto &inferred_value) mutable {
                // All entries of batch_shape already consumed: pad with 1.
                if (cnt >= batch_shape_size) {
                  broadcasted_shape.insert(broadcasted_shape.begin(), 1);
                  return;
                }
                const auto batch_shape_value =
                    batch_shape.at(batch_shape_size - cnt - 1);
                if (inferred_value != batch_shape_value &&
                    batch_shape_value != 1) {
                  broadcasted_shape.insert(broadcasted_shape.begin(), 1);
                } else {
                  broadcasted_shape.insert(broadcasted_shape.begin(),
                                           batch_shape_value);
                  ++cnt;
                }
              });
          return broadcasted_shape;
        };

    const std::vector expand_batch_portion = at::infer_size(b_x1, b_x2);
    b_x1 = get_broadcasted_batch_shape(b_x1, expand_batch_portion);
    b_x2 = get_broadcasted_batch_shape(b_x2, expand_batch_portion);
  }

  // Prepend the batch dimensions to the expanded shapes.
  x1_shape_expanded.insert(x1_shape_expanded.begin(), b_x1.cbegin(),
                           b_x1.cend());
  x2_shape_expanded.insert(x2_shape_expanded.begin(), b_x2.cbegin(),
                           b_x2.cend());

  auto *x1_expanded = createReshape(graph, x1, x1_shape_expanded)->output();
  auto *x2_expanded = createReshape(graph, x2, x2_shape_expanded)->output();
  auto *x1_minus_x2 = createSub(graph, {x1_expanded, x2_expanded})->output();

  // Reduce over the last dimension (-1) without keeping it.
  auto *dims = createAndInsertNode(graph, c10::prim::ListConstruct,
                                   {wrapInConstant1D(graph, -1)})
                   ->output();
  auto *keepdim = createConstantLong(graph, {0}, {1})->output();

  return createHandlerOperation(graph, getHandler(c10::aten::norm),
                                {x1_minus_x2, p_degree, dims, keepdim});
}

} // namespace

// Registers the distance handlers.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::pairwise_distance, pairwiseDistanceHandler);
  registerHandler(c10::aten::cosine_similarity, cosineSimilarityHandler);
  registerHandler(c10::aten::cdist, cdistHandler);
  registerHandler(c10::aten::_cdist_forward, cdistHandler);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/DropoutOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Canonicalises aten::dropout: an identity when the constant `train` input is
// false, otherwise a dropout with ratio `p`.
torch::jit::Node *dropoutHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *x = node->input(0);
  auto p = constantToFloat(node->input(1)->node());
  auto train = constantToBool(node->input(2)->node());

  if (!train) {
    return createIdentity(graph, {x});
  }
  return createDropout(graph, {x}, 1, p);
}

// Canonicalises aten::feature_dropout: an identity when not training,
// otherwise a shaped dropout whose mask shape keeps the first two dimensions
// of the input and is 1 everywhere else.
torch::jit::Node *featureDropoutHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  float ratio = constantToFloat(node->input(1)->node());
  bool train = constantToBool(node->input(2)->node());

  if (!train) {
    return createIdentity(graph, {input});
  }

  // Input tensor is required to be more than 2-d since feature dropout assumes
  // that the input represents a 2-d map of features: N x C x (feature shape)
  // NOTE(review): the check below only rejects inputs with fewer than 2
  // dimensions, so an exactly 2-d input is accepted.
  std::vector drop_shape = shapeFromTensor(input);
  ERROR_ON_MSG(drop_shape.size() < 2,
               "Feature dropout requires at least 2 dimensions in the input");

  // The dropout mask shape will be N x C with as many trailing singleton
  // dimensions as needed to meet the broadcast requirement
  std::fill(drop_shape.begin() + 2, drop_shape.end(), 1);

  return createShapeddropout(graph, {input}, drop_shape, ratio);
}

} // namespace

// Registers the dropout handlers.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::feature_dropout, featureDropoutHandler);
  registerHandler(c10::aten::dropout, dropoutHandler);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/EinsumOp.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "EinsumOp.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {

// Parses the einsum equation `eq` for the given operand tensors: splits it
// into lhs/rhs, splits the lhs into one label per operand, records the order
// and multiplicity of each character, derives an implicit rhs when no "->" is
// present, and sizes the per-dimension bitsets.
EinsumOp::EinsumOp(std::string eq, const std::vector &tensors) {
  _tensors = tensors;

  // Remove all whitespace in equation
  eq.erase(std::remove(eq.begin(), eq.end(), ' '), eq.end());

  // Without an arrow the whole equation is the lhs.
  _lhs = eq;
  auto pos = eq.find("->");
  if (pos != std::string::npos) {
    _lhs = eq.substr(0, pos);
    // Add 2 to exclude arrow
    _rhs = eq.substr(pos + 2);
  }

  // Split lhs into labels using ',' delimiter
  std::stringstream ss(_lhs);
  std::string s;
  while (std::getline(ss, s, ',')) {
    _labels.push_back(s);
  }
  // One label is required per operand tensor.
  ERROR_ON(_labels.size() != _tensors.size());

  for (const auto &label : _labels) {
    for (char c : label) {
      if (_lhs_char_indices.find(c) == _lhs_char_indices.end()) {
        // First occurrence: assign the next dimension index to this char.
        _lhs_char_indices[c] = _ordered_chars.size();
        _char_counts_seen[c] = 0;
        _char_counts_remaining[c] = 1;
        _ordered_chars.push_back(c);
      } else {
        _char_counts_remaining[c]++;
      }
    }
  }

  // Shared rank of tensors during multiplication
  _n_dims = _ordered_chars.size();

  // Calculate implicit rhs according to classical einstein summation
  if (pos == std::string::npos) {
    // Keep only the characters that appear exactly once in the lhs.
    std::copy_if(_ordered_chars.begin(), _ordered_chars.end(),
                 std::back_inserter(_rhs),
                 [&](char c) { return _char_counts_remaining[c] == 1; });
    // Must be alphabetical in this case
    std::sort(_rhs.begin(), _rhs.end());
  }

  _rdims_bs.resize(_n_dims);
  _bdims_bs.resize(_n_dims);
  _rhs_bs.resize(_n_dims);

  for (char c : _rhs) {
    _rhs_bs[_lhs_char_indices[c]] = true;
    _rhs_char_indices[c] = _rhs_char_indices.size();
  }

  // All characters must be present in the map but only the indices of rhs
  // characters matter
  for (char c : _lhs) {
    _rhs_char_indices.emplace(c, 0);
  }
}

// Builds the graph for the einsum and reshapes the result to `output_shape`.
// A single operand becomes a reduce-sum or a transpose; several operands are
// multiplied left to right and the result permuted into rhs order.
torch::jit::Node *
EinsumOp::create(torch::jit::Graph *graph,
                 const std::vector &output_shape) {
  canonicalizeTensors(graph);

  torch::jit::Node *output = nullptr;

  // One tensor means summation or transpose is applied
  if (_tensors.size() == 1) {
    if (_lhs.size() > _rhs.size()) {
      // Sum over every dimension that does not appear in the rhs.
      std::vector axes;
      for (std::size_t i = 0; i < _n_dims; i++) {
        if (!_rhs_bs[i]) {
          axes.push_back(static_cast(i));
        }
      }
      output = createReducesum(graph, {_tensors[0]}, axes, 1);
    } else {
      std::vector p_lhs =
          sortedPermutation(_rhs_char_indices, _labels[0]);
      output = createTranspose(graph, {_tensors[0]}, p_lhs);
    }
  } else {
    updateCharCounts(_labels[0]);

    // Base output
    output = _tensors[0]->node();

    // Build product from left to right
    for (std::size_t i = 1; i < _tensors.size(); i++) {
      output = createProduct(graph, output->output(), _tensors[i], _labels[i]);
    }

    output = permuteOutput(graph, output->output());
  }

  // Remove reduced single dimensions by reshaping
  return createReshape(graph, output->output(), output_shape);
}

// Multiplies x1 and x2 (same rank required) as a batched matmul, using
// _rdims_bs to select the reduced dimensions and _bdims_bs the batch
// dimensions, then reshapes the result back to one entry per dimension.
torch::jit::Node *EinsumOp::tensordotBmm(torch::jit::Graph *graph,
                                         torch::jit::Value *x1,
                                         torch::jit::Value *x2) const {
  const std::vector shape_x1 = shapeFromTensor(x1);
  const std::vector shape_x2 = shapeFromTensor(x2);
  ERROR_ON(shape_x1.size() != shape_x2.size());

  // Products of the sizes of the reduce dims and of the batch dims.
  std::int64_t rdims_prod = 1;
  std::int64_t bdims_prod = 1;

  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_rdims_bs[i]) {
      if (shape_x1[i] == shape_x2[i]) {
        rdims_prod *= shape_x1[i];
      } else if (shape_x1[i] == 1) {
        // Only x2 has this reduce dim: sum it out of x2 directly.
        x2 = createReducesum(graph, {x2}, {static_cast(i)}, 1)
                 ->output();
      } else if (shape_x2[i] == 1) {
        // Only x1 has this reduce dim: sum it out of x1 directly.
        x1 = createReducesum(graph, {x1}, {static_cast(i)}, 1)
                 ->output();
      }
    }

    if (_bdims_bs[i]) {
      bdims_prod *= shape_x1[i];
    }
  }

  // Partitions existing permutation vector p according to bitset bs. If
  // should_partition_front == true, elements of p are moved to the front
  // if the corresponding bool in bs == true. Otherwise, they are moved to
  // the back. The relative order of other elements must not change.
  auto fn_partition = [&](auto &p, const auto &bs,
                          bool should_partition_front) {
    std::stable_partition(p.begin(), p.end(), [&](std::int64_t n) {
      return bs[n] == should_partition_front;
    });
  };

  // Original permutation
  std::vector p1(_n_dims);
  std::iota(p1.begin(), p1.end(), 0);
  std::vector p2 = p1;

  // Cast the reduction to a batch matrix multiplication by permuting input
  // dimensions and reshaping to ensure there is one batch dimension and
  // one reduce (dot product) dimension.

  // Permute x1 so that rdims are the last dims
  fn_partition(p1, _rdims_bs, false);
  // Permute again so that bdims are the first dims
  fn_partition(p1, _bdims_bs, true);
  torch::jit::Node *p_x1 = createTranspose(graph, {x1}, p1);

  // Reshape to (bdims_prod, -1, rdims_prod)
  torch::jit::Node *p_x1_bmat =
      createReshape(graph, p_x1->output(), {bdims_prod, -1, rdims_prod});

  // Permute x2 so that rdims are the first dims
  fn_partition(p2, _rdims_bs, true);
  // Permute again so that bdims are the first dims and rdims follow
  fn_partition(p2, _bdims_bs, true);
  torch::jit::Node *p_x2 = createTranspose(graph, {x2}, p2);

  // Reshape to (bdims_prod, rdims_prod, -1)
  torch::jit::Node *p_x2_bmat =
      createReshape(graph, p_x2->output(), {bdims_prod, rdims_prod, -1});

  // Matmul -> (bdims_prod, unreduced_x1, unreduced_x2)
  torch::jit::Node *mm =
      createMatmul(graph, {p_x1_bmat->output(), p_x2_bmat->output()});

  // Output shape: batch dims first, then the remaining dims in order, with
  // reduced dims kept as size 1.
  std::vector new_shape;
  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_bdims_bs[i]) {
      new_shape.push_back(shape_x1[i]);
    }
  }
  for (std::size_t i = 0; i < _n_dims; i++) {
    if (_rdims_bs[i]) {
      new_shape.push_back(1);
    } else if (!_bdims_bs[i]) {
      // If not a batch dim or reduce dim, at least one dim == 1
      // so we can multiply to get the right result
      new_shape.push_back(shape_x1[i] * shape_x2[i]);
    }
  }

  // Restore flattened dims
  return createReshape(graph, mm->output(), new_shape);
}

// Rewrites every operand in place so that its dimensions follow the order of
// _ordered_chars, with a size-1 dimension inserted for each character missing
// from that operand's label.
void EinsumOp::canonicalizeTensors(torch::jit::Graph *graph) {
  for (std::size_t i = 0; i < _tensors.size(); i++) {
    torch::jit::Value *t = _tensors[i];
    std::vector shape = shapeFromTensor(t);

    // Get permute indices of lhs
    std::vector p_lhs =
        sortedPermutation(_lhs_char_indices, _labels[i]);

    // Calculate permuted shape and label
    std::vector shape_p;
    std::transform(p_lhs.begin(), p_lhs.end(), std::back_inserter(shape_p),
                   [&](auto d) { return shape[d]; });

    // TODO(T60456): Implement diagonals whenever ai.onnx.EyeLike is implemented
    // in PopART

    // Insert missing dims
    for (std::size_t j = 0; j < _ordered_chars.size(); j++) {
      if (_labels[i].find(_ordered_chars[j]) == std::string::npos) {
        shape_p.insert(shape_p.begin() + j, 1);
      }
    }

    // Permute and reshape
    t = createTranspose(graph, {t}, p_lhs)->output();
    _tensors[i] = createReshape(graph, t, shape_p)->output();
  }
}

// Transposes `output` from the batch-dims-first layout back to the order
// given by the rhs, using a single combined permutation.
torch::jit::Node *EinsumOp::permuteOutput(torch::jit::Graph *graph,
                                          torch::jit::Value *output) const {
  // Characters in the layout of `output`: batch dims first, rest in order.
  std::vector out_chars = _ordered_chars;
  std::stable_partition(out_chars.begin(), out_chars.end(), [&](char c) {
    return _bdims_bs[_lhs_char_indices.at(c)];
  });

  // Permute batch dims back to original locations
  std::vector p_lhs =
      sortedPermutation(_lhs_char_indices, out_chars);

  // Permute to the order specified by rhs
  std::vector p_rhs =
      sortedPermutation(_rhs_char_indices, _ordered_chars);

  // Combine permutations
  std::vector p_combined;
  std::transform(p_rhs.begin(), p_rhs.end(), std::back_inserter(p_combined),
                 [&](auto d) { return p_lhs[d]; });

  return createTranspose(graph, {output}, p_combined);
}

// Marks every character of `label` as seen once more and remaining once less.
void EinsumOp::updateCharCounts(const std::string &label) {
  for (char c : label) {
    _char_counts_seen[c]++;
    _char_counts_remaining[c]--;
  }
}

// Multiplies the running product `lhs` with the next operand `rhs`, first
// recomputing which dimensions are reduced and which are batch dimensions
// for this step.
torch::jit::Node *EinsumOp::createProduct(torch::jit::Graph *graph,
                                          torch::jit::Value *lhs,
                                          torch::jit::Value *rhs,
                                          const std::string &rhs_label) {
  updateCharCounts(rhs_label);

  for (std::size_t i = 0; i < _n_dims; i++) {
    char c = _ordered_chars[i];
    // if dim appears in rhs, don't reduce
    // if dim appears in future operands, don't reduce yet
    _rdims_bs[i] =
        !_rhs_bs[i] && _char_counts_remaining[_ordered_chars[i]] == 0;
    // A non-reduced dim seen in more than one operand so far is a batch dim.
    _bdims_bs[i] = !_rdims_bs[i] && _char_counts_seen[c] > 1;
  }

  return tensordotBmm(graph, lhs, rhs);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/EinsumOp.hpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
// NOTE(review): no include guard or #pragma once is visible in this header -
// confirm against the real file.
#include
#include
#include
#include
#include

namespace poptorch {

// Helper that lowers an einsum equation over a list of operand tensors into
// transpose / reshape / reduce-sum / matmul nodes.
class EinsumOp {
public:
  // Parses the equation `eq` for the given operand tensors.
  EinsumOp(std::string eq, const std::vector &tensors);

  // Emits the graph nodes and returns the node producing a tensor reshaped to
  // `output_shape`.
  torch::jit::Node *create(torch::jit::Graph *graph,
                           const std::vector &output_shape);

private:
  // A modified version of tensordot that handles batch dimensions and takes
  // two tensors of the same rank that have been unsqueezed (if necessary) to
  // match. The output is of the same rank. Batch dims always appear first in
  // the output to allow chaining.
  torch::jit::Node *tensordotBmm(torch::jit::Graph *graph,
                                 torch::jit::Value *x1,
                                 torch::jit::Value *x2) const;

  // Get permute indices of 's' according to the order specified by char_indices
  template
  std::vector
  sortedPermutation(const std::unordered_map &char_indices,
                    const T &s) const {
    // Start from the identity permutation and sort positions by the index
    // that char_indices assigns to the character at each position.
    std::vector p(s.size());
    std::iota(p.begin(), p.end(), 0);
    std::sort(p.begin(), p.end(), [&](auto d1, auto d2) {
      return char_indices.at(s[d1]) < char_indices.at(s[d2]);
    });
    return p;
  }

  // Ensure all tensors have same number of dims that are in the same order -
  // The order in which they appear in the lhs
  void canonicalizeTensors(torch::jit::Graph *graph);

  // Combines the following permutations into a single permutation:
  // 1) Permuting batch dims to their original locations
  // 2) Permuting to the order specified by the rhs
  torch::jit::Node *permuteOutput(torch::jit::Graph *graph,
                                  torch::jit::Value *output) const;

  // Updates char counts used to calculate reduce dims and batch dims
  void updateCharCounts(const std::string &label);

  // Multiplies the running product `lhs` by the operand `rhs` whose label is
  // `rhs_label`.
  torch::jit::Node *createProduct(torch::jit::Graph *graph,
                                  torch::jit::Value *lhs,
                                  torch::jit::Value *rhs,
                                  const std::string &rhs_label);

  // Operand tensors; rewritten in place by canonicalizeTensors.
  std::vector _tensors;

  // Left and right hand sides of the equation (whitespace removed).
  std::string _lhs, _rhs;

  // One label per operand, obtained by splitting _lhs on ','.
  std::vector _labels;

  // Number of distinct characters in the lhs.
  std::size_t _n_dims;

  // List of characters ordered as seen from left to right. This
  // is the order of dims during the multiply/reduce stage
  std::vector _ordered_chars;

  // Used to determine whether a non-reduce dimension should be
  // considered a batch dimension during calculation
  std::unordered_map _char_counts_seen;

  // Number of times a character appears in future operands -
  // used to determine whether a dimension should be reduced
  std::unordered_map _char_counts_remaining;

  // Mapping of each character to the index in which it appears
  // in the intermediate tensor shape
  std::unordered_map _lhs_char_indices;

  // Mapping of each character to the index in which it appears
  // in the output shape
  std::unordered_map _rhs_char_indices;

  // Bitset indicating dimensions to be reduced
  std::vector _rdims_bs;

  // Bitset indicating batch dimensions
  std::vector _bdims_bs;

  // Bitset indicating dimensions that appear in rhs
  std::vector _rhs_bs;
}; // class einsum

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/EmbeddingOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include
#include
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"
#include

namespace poptorch {
namespace {

// Canonicalises aten::embedding. Without a padding index it is a gather on
// dimension 0 of the weight; with one it becomes the "Embedding" custom op
// from the "poptorch.custom_ops" domain. scale_grad_by_freq and sparse are
// rejected.
torch::jit::Node *embeddingHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::embedding(Tensor weight, Tensor indices, int padding_idx, bool
  // scale_grad_by_freq, bool sparse) -> Tensor
  const bool scale_grad_by_freq = constantToBool(node->input(3)->node());
  const bool sparse = constantToBool(node->input(4)->node());

  ERROR_ON_MSG(scale_grad_by_freq || sparse,
               "Unsupported aten::embedding operation");

  auto *weight = node->input(0);
  auto *indices = node->input(1);
  const auto padding_idx = constantToLong(node->input(2)->node());

  if (padding_idx < 0) {
    // Default: padding_idx == -1 indicates no padding.
    return createGather(graph, {node->input(0), node->input(1)}, 0);
  }

  // The padding index is passed to the custom op as a small JSON object.
  const std::string msg = fmt::format("{{\"padding_idx\":{}}}", padding_idx);
  auto *out = createCustomOperation(graph, {weight, indices}, "Embedding",
                                    "poptorch.custom_ops", 1, 1, msg);

  // Give the output the scalar type of the weight; the other arguments of
  // TensorType::create are left as nullopt.
  const auto input_type = getNodeScalarType(weight);
  const auto out_type = c10::TensorType::create(input_type, c10::nullopt,
                                                c10::nullopt, c10::nullopt);
  out->output(0)->setType(out_type);
  return out;
}

// Canonicalises aten::embedding_bag / aten::_embedding_bag by unrolling over
// the (constant) offsets: each bag is sliced out of the indices, gathered
// from the weight, optionally scaled by per-sample weights, reduced over
// dimension 0, and the per-bag results are concatenated.
torch::jit::Node *embeddingBagHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  // aten::embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool
  // scale_grad_by_freq, int mode, bool sparse, Tensor per_sample_weights, bool
  // include_last_offset, int? padding_idx) -> Tensor
  const bool scale_grad_by_freq = constantToBool(node->input(3)->node());
  const bool sparse = constantToBool(node->input(5)->node());
  auto *padding_idx = node->input(8);

  ERROR_ON_MSG(scale_grad_by_freq || sparse,
               "Unsupported aten::embedding_bag operation");

  // A padding index is only accepted when it is None or negative.
  if (!isNone(padding_idx)) {
    const auto padding_idx_val = constantToInt(node->input(8)->node());
    ERROR_ON_MSG(padding_idx_val >= 0,
                 "Unsupported aten::embedding_bag operation: padding_idx "
                 "parameter is unsupported.");
  }

  // aten::embedding_bag has 4 outputs but only the first one is used so we
  // delete them here to match our output
  while (node->outputs().size() > 1) {
    node->eraseOutput(node->outputs().size() - 1);
  }

  auto *weight = node->input(0);
  auto *indices = node->input(1);
  auto *offsets = node->input(2);
  const int64_t mode = constantToLong(node->input(4)->node());
  auto *per_sample_weights = node->input(6);
  const bool include_last_offset = constantToBool(node->input(7)->node());

  // Per-bag reduction over dimension 0 (kept as a size-1 dimension):
  // mode 0 -> sum, mode 1 -> mean, any other value -> max.
  const auto reduction = [mode](torch::jit::Graph *g, torch::jit::Value *v) {
    if (mode == 0) {
      return createReducesum(g, {v}, {0}, 1)->output();
    }
    if (mode == 1) {
      return createReducemean(g, {v}, {0}, 1)->output();
    }
    return createReducemax(g, {v}, {0}, 1)->output();
  };

  ERROR_ON_MSG(!isTensorConstant(offsets->node()),
               "Unsupported aten::embedding_bag operation: offsets tensor must "
               "be a constant.");

  auto offsets_tensor = getNodeTensorAttrValue(offsets->node());
  if (!include_last_offset) {
    // Append INT_MAX to use as the last offset slice
    offsets_tensor = at::cat({offsets_tensor, at::tensor(INT_MAX)});
  }
  const auto slices = offsets_tensor.accessor();

  torch::jit::value_list values;

  // Use the offsets to extract each bag from the indices.
  // For each bag: Gather then reduce from the embedding matrix
  for (int64_t i = 0; i < offsets_tensor.size(0) - 1; i++) {
    // Bag i covers indices [slices[i], slices[i + 1]).
    auto *bag =
        createSlice(graph, {indices}, {slices[i + 1]}, {slices[i]}, {0})
            ->output();
    auto *gather = createGather(graph, {weight, bag}, 0)->output();

    if (!isNone(per_sample_weights)) {
      // Scale each gathered row by its per-sample weight; the weight slice is
      // unsqueezed on dimension 1 before the multiply.
      auto *psw = createSlice(graph, {per_sample_weights}, {slices[i + 1]},
                              {slices[i]}, {0})
                      ->output();
      psw = createUnsqueeze(graph, {psw}, {1})->output();
      gather = createMul(graph, {gather, psw})->output();
    }

    values.push_back(reduction(graph, gather));
  }

  return createConcat(graph, values, 0);
}

// Canonicalises aten::one_hot into a one-hot op along the last axis with
// cold/hot values 0/1. num_classes must be an explicit constant (not -1).
torch::jit::Node *onehotHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *tensor = node->input(0);
  std::int64_t const num_classes = constantToLong(node->input(1)->node());

  ERROR_ON_MSG(num_classes == -1,
               "OneHot num classes must be specified and must be constant.");

  // The "hot/cold" values for the one hot representation.
  torch::jit::Node *values = createConstantInt(graph, {0, 1}, {2});
  torch::jit::Node *depth = createConstantInt(graph, {num_classes}, {});

  return createOnehot(graph, {tensor, depth->output(), values->output()}, -1);
}

} // namespace

// Registers the embedding and one-hot handlers.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::embedding, embeddingHandler);
  registerHandler(c10::aten::embedding_bag, embeddingBagHandler);
  registerHandler(c10::aten::_embedding_bag, embeddingBagHandler);
  registerHandler(c10::aten::one_hot, onehotHandler);
}

} // namespace poptorch
================================================
FILE: poptorch/source/popart_canonicalization/IndexOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include #include #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "PopartCanonicalizationUtils.hpp" #include "ScatterReduction.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { struct IndexInfo { torch::jit::Value *x_partial_flat; torch::jit::Value *indices_partial_flat; }; std::vector padShape(const std::vector &shape, std::size_t pad, bool pad_front) { std::vector output_shape; auto ones_generator = []() { return 1; }; if (pad_front) { std::generate_n(std::back_inserter(output_shape), pad, ones_generator); } std::copy(shape.begin(), shape.end(), std::back_inserter(output_shape)); if (!pad_front) { std::generate_n(std::back_inserter(output_shape), pad, ones_generator); } return output_shape; } IndexInfo processIndex(torch::jit::Graph *graph, torch::jit::Value *x, std::vector *p_indices) { auto &indices = *p_indices; auto shape = shapeFromTensor(x); std::size_t pad = 0; std::vector index_shape; bool indexed = false; bool pad_front = true; // Calculate the final index size with which the gather operation will be // performed for (torch::jit::Value *index : indices) { if (isNone(index)) { if (indexed) { pad_front = false; } pad++; } else { auto s = shapeFromTensor(index); if (s.size() > index_shape.size()) { index_shape = s; } indexed = true; } } std::size_t const index_size = index_shape.size(); std::vector flat_indices_shape = padShape(index_shape, pad, pad_front); std::size_t nones_indexed = 0; // Reshape each tensor into shape broadcastable with final output shape for (std::size_t i = 0; i < indices.size(); i++) { if (isNone(indices[i])) { // Optional tensors: 'None' means indexing over entire dimension // Replace each None tensor with its explicit index representation std::vector idx(shape[i]); std::iota(idx.begin(), idx.end(), 0); std::vector new_shape(index_size 
+ pad, 1); auto final_shape_index = pad_front ? nones_indexed : index_size + nones_indexed; new_shape[final_shape_index] = shape[i]; flat_indices_shape[final_shape_index] = shape[i]; nones_indexed++; indices[i] = createReshape(graph, intVectorToIrConstant(graph, idx), new_shape) ->output(); } else { const auto original_shape = shapeFromTensor(indices[i]); const std::vector new_shape = padShape(original_shape, pad, pad_front); indices[i] = createReshape(graph, indices[i], new_shape)->output(); } } auto *flat_indices = indices[indices.size() - 1]; std::int64_t stride = shape[indices.size() - 1]; // Calculate indices within partially flattened shape // Tensors are automatically broadcast to the correct shape during calculation for (auto i = 1u; i < indices.size(); i++) { auto *index = indices[indices.size() - i - 1]; auto *offset = createMul(graph, {index, wrapInConstant1D(graph, stride)})->output(); flat_indices = createAdd(graph, {flat_indices, offset})->output(); stride *= shape[indices.size() - i - 1]; } // Retain the shape for downstream calculation flat_indices = createReshape(graph, flat_indices, flat_indices_shape)->output(); std::vector flatten_shape = {-1}; std::copy_n(shape.begin() + indices.size(), shape.size() - indices.size(), std::back_inserter(flatten_shape)); // Flatten the tensor being indexed into [-1, u1, u2, ..., uN] where // each u is a dimension not being indexed into const int64_t num_elems = std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies()); auto *flatten = createReshape(graph, x, at::infer_size(flatten_shape, num_elems)); return {flatten->output(), flat_indices}; } torch::jit::Node *indexHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::index(Tensor self, Tensor?[] indices) torch::jit::Value *x = node->input(0); std::vector indices = handleTensorList(node->input(1)->node()); const IndexInfo info = processIndex(graph, x, &indices); // Gather in first dimension using calculated indices into partially 
flattened // tensor return createGather(graph, {info.x_partial_flat, info.indices_partial_flat}, 0); } bool isMaskedAssign(torch::jit::Graph *graph, torch::jit::Value *x, std::vector &indices) { // Masked fill only takes one index tensor which is broadcastable // with the input if (indices.size() != 1) { return false; } auto index = indices[0]->type()->expect(); ERROR_ON(!index->scalarType().has_value()); auto dtype = index->scalarType().value(); // Masks must be of type bool or byte if (dtype != c10::ScalarType::Bool && dtype != c10::ScalarType::Byte) { return false; } auto mask_shape = shapeFromTensor(indices[0]); auto x_shape = shapeFromTensor(x); // popart::where expects a bool tensor mask so cast if necessary if (dtype == c10::ScalarType::Byte) { indices[0] = createCast(graph, indices[0], c10::ScalarType::Bool)->output(); } // Pad indices to enable broadcasting if (mask_shape.size() < x_shape.size()) { mask_shape.resize(x_shape.size(), 1); indices[0] = createReshape(graph, indices[0], mask_shape)->output(); } return true; } std::optional canVectorizeInDim(std::vector &indices) { std::optional dim; std::int32_t const num_indices = static_cast(indices.size()); for (std::int32_t i = 0; i < num_indices; i++) { if (isNone(indices[i])) { continue; } if (dim) { // Already found a valid dim but additional indices are specified so // cannot vectorise this case. return std::nullopt; } auto idx = indices[i]->type()->expect(); ERROR_ON(!idx->scalarType().has_value()); auto dtype = idx->scalarType().value(); if (!isIntegralType(dtype, false)) { return std::nullopt; } if (idx->dim() != 1 || idx->numel() == 1) { return std::nullopt; } dim = i; } return dim; } void applyInplaceSlice(torch::jit::Node *node, torch::jit::Node *out) { // If we're performing an index_put on a slice - this should operate // "in-place" // // Slices are tensor views in torch, and index_put_ should modify the tensor // being sliced. 
To simulate in-place modification to slices, we replace all // uses of the tensor being sliced with the output of this operation torch::jit::Value *x = node->input(0); if (x->node()->kind() == symbols::popart::slice) { auto *slice_input = x->node()->input(0); // Recursively follow the chain of slices until we find the original tensor // actually being sliced while (slice_input->node()->kind() == symbols::popart::slice) { slice_input = slice_input->node()->input(0); } slice_input->replaceAllUsesAfterNodeWith(node, out->output()); } } torch::jit::Node *indexPutHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::index_put(Tensor self, Tensor?[] indices, Tensor value, bool // accumulate) torch::jit::Value *x = node->input(0); std::vector indices = handleTensorList(node->input(1)->node()); torch::jit::Value *v = node->input(2); if (isMaskedAssign(graph, x, indices)) { return createWhere(graph, {indices[0], v, x}); } const auto fn_gen_none = [graph]() { torch::jit::Value *none = graph->create(c10::prim::Constant)->output(); none->setType(c10::NoneType::get()); return none; }; const auto shape = shapeFromTensor(x); const auto vectorized_dim = canVectorizeInDim(indices); const auto v_shape = shapeFromTensor(v); if (vectorized_dim) { logging::trace( "Using vectorized ScatterReduce with none reduction in dim {}", *vectorized_dim); // Expand the value tensor to match the input if necessary if (v_shape.size() < shape.size()) { auto new_shape = shape; // In the vectorised case, the index will always be a 1D tensor new_shape[*vectorized_dim] = shapeFromTensor(indices[*vectorized_dim])[0]; v = createExpand(graph, {v, intVectorToIrConstant(graph, new_shape)}) ->output(); } static constexpr auto none_reduce = static_cast(ScatterReduction::None); static constexpr bool enable_index_broadcast = true; auto *out = createScatterreduce(graph, {v, indices[*vectorized_dim], x}, shape[0], *vectorized_dim, enable_index_broadcast, none_reduce); applyInplaceSlice(node, out); 
return out; } // ONNX Scatter cannot assign entire dimensions, only individual elements, so // we must pad the end of indices with NoneTypes so that the entire input is // flattened during indexing std::generate_n(std::back_inserter(indices), shape.size() - indices.size(), fn_gen_none); IndexInfo info = processIndex(graph, x, &indices); auto indices_shape = shapeFromTensor(info.indices_partial_flat); auto indices_size = std::accumulate(indices_shape.begin(), indices_shape.end(), 1, std::multiplies{}); // Ensure value tensor can be broadcast with indexing result if (v_shape.size() < indices_shape.size()) { v = createReshape(graph, v, {1, -1})->output(); auto v_size = std::accumulate(v_shape.begin(), v_shape.end(), 1, std::multiplies{}); // Repeat v to match indices shape v = createExpand(graph, {v, intVectorToIrConstant( graph, {indices_size / v_size, 1})}) ->output(); } info.indices_partial_flat = createReshape(graph, info.indices_partial_flat, {indices_size})->output(); v = createReshape(graph, v, {indices_size})->output(); // Scatter in first dimension using calculated indices into fully flattened // tensor auto *scatter = createScatterElements( graph, {info.x_partial_flat, info.indices_partial_flat, v}, 0); // Restore original input shape auto *out = createReshape(graph, scatter->output(), shape); applyInplaceSlice(node, out); return out; } torch::jit::Node *indexFillHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar // value) -> Tensor aten::index_fill.int_Tensor(Tensor self, int dim, Tensor // index, Tensor value) -> Tensor auto *self = node->input(0); auto dim = constantToLong(node->input(1)->node()); auto *index = node->input(2); auto *value = node->input(3); auto self_dtype = getNodeScalarType(self); if (getNodeScalarType(value) != self_dtype) { value = createCast(graph, value, self_dtype)->output(); } // Create Tensor?[] indices, where indices[dim] = index, and indices[d] = 
// None, where d < dim std::vector indices; auto fn_gen_none = [graph]() { auto *none = graph->createNone(); insertNodeInGraph(graph, none); return none->output(); }; std::generate_n(std::back_inserter(indices), dim, fn_gen_none); indices.push_back(index); auto *list = createAndInsertNode(graph, c10::prim::ListConstruct, indices); auto *accumulate = createConstantInt(graph, {0}, {}); // Re-use index_put handler auto index_put_handler = getHandler(c10::aten::index_put); return createHandlerOperation( graph, index_put_handler, {self, list->output(), value, accumulate->output()}); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::index, indexHandler); registerHandler(c10::aten::index_put, indexPutHandler); registerHandler(c10::aten::index_fill, indexFillHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/LossOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { torch::jit::Node *binaryCrossEntropyHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::binary_cross_entropy(Tensor input, Tensor target, // Tensor? weight, int reduction) // L = loss, w = weight, y= target, x = input. // Algorithm is: L = - w * (y *log(x) + (1 - y)*log(1 - x)) // The input. torch::jit::Value *x = node->input(0); // The target. torch::jit::Value *y = node->input(1); // Optional weight term. torch::jit::Value *weight = node->input(2); // Loss reduction. 
std::int64_t reduction = constantToLong(node->input(3)->node()); // Convert to popart reduce values. reduction = convertReduceToPopart(reduction); // Add the one constant torch::jit::Node *one = createConstantFloatLike(graph, x, {1.0}, {}); torch::jit::Node *log_x = createLog(graph, {x}); // Log(x)*y torch::jit::Node *log_x_mul_y = createMul(graph, {y, log_x->output()}); // Do (1 - y) and (1 - x) torch::jit::Node *x_minus_one = createSub(graph, {one->output(), x}); torch::jit::Node *y_minus_one = createSub(graph, {one->output(), y}); // Log(1 - x) torch::jit::Node *log_x_minus_one = createLog(graph, {x_minus_one->output()}); // (1 -y)*Log(1 - x) torch::jit::Node *subs_multiplied = createMul(graph, {y_minus_one->output(), log_x_minus_one->output()}); // Log(x)*y + (1 -y)*Log(1 - x) torch::jit::Node *add_terms = createAdd(graph, {log_x_mul_y->output(), subs_multiplied->output()}); torch::jit::Node *final_node = add_terms; if (weight->node()->kind() != c10::prim::Constant) { final_node = createMul(graph, {add_terms->output(), weight}); } final_node = createNeg(graph, {final_node->output()}); return createIdentityloss(graph, {final_node->output()}, reduction); } torch::jit::Node *nllLossNdHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // "aten::nll_loss2d(Tensor input, Tensor target, Tensor height, Tensor // weight, int reduction, int ignore_index) -> Tensor" // aten::nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int // reduction, int ignore_index) -> (Tensor output, Tensor total_weight) // aten::nll_loss2d() is implemented based on popart:nllloss(). // Suppose the input[0] has the shape of (N, C, M, K) // input[0] will be transposed with perm [0, 2, 3, 1], // and reshaped with (N * M * K, C), pushing C to the last dimension. // input[1] will be reshaped to (N * M * K), before calling nllloss. 
// The generated IRs are as follows: // %37 : Tensor = popart::transpose[perm=[0, 2, 3, 1]](%35) // %38 : Tensor(500:4, 4:1) = popart::reshape_static_shape[shape=[500,4]](%37) // %39 : Int(500:1) = popart::reshape_static_shape[shape=[500]](%25) // %40 : Float() = popart::nllloss[reduction=1, ignoreIndex=-100](%38, %39) // aten::nll_loss_nd(Tensor input, Tensor target, Tensor? weight, int // reduction, int ignore_index) -> Tensor std::int64_t reduction = constantToLong(node->input(3)->node()); std::int64_t ignore_index = constantToLong(node->input(4)->node()); reduction = convertReduceToPopart(reduction); torch::jit::Value *input = node->input(0); torch::jit::Value *target = node->input(1); torch::jit::Value *weight = node->input(2); // TODO(T42695): Support optional weight parameter ERROR_ON_MSG(!isNone(weight), "Parameter \"weight\" is unsupported for aten::nll_loss_nd"); std::vector shape_input = shapeFromTensor(input); std::vector shape_target = shapeFromTensor(target); if (shape_input.size() != 2) { // Input shape: (N, C, d1, d2, ..., dk) // Target shape: (N, d1, d2, ..., dk) // Suppose the input has the shape of (N, C, M, K) // The input will be transposed with perm [0, 2, 3, 1], // and reshaped with (N * M * K, C), pushing C to the last dimension. 
// The target will be reshaped to (N * M * K), before calling nllloss std::int64_t c = shape_input[1]; std::int64_t flat = std::accumulate(shape_target.begin(), shape_target.end(), 1, std::multiplies{}); // Create an input permutation of (0, 2, 3, ..., N, 1) std::vector p(shape_input.size(), 0); std::iota(p.begin() + 1, p.end() - 1, 2); p[p.size() - 1] = 1; // Permute the class dimension to the end torch::jit::Node *perm = createTranspose(graph, {input}, p); input = createReshape(graph, perm->output(), {flat, c})->output(); target = createReshape(graph, target, {flat})->output(); } torch::jit::Node *loss = createNllloss(graph, {input, target}, reduction, ignore_index, /*inputIsLogProbability=*/true); if (reduction == 2) { // If "none" reduction, return the results with target's original shape loss = createReshape(graph, loss->output(), shape_target); } return createIdentityloss(graph, {loss->output()}, reduction); } torch::jit::Node *crossEntropyLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? 
weight, int // reduction, int ignore_index) auto *input = node->input(0); auto *target = node->input(1); auto *weight = node->input(2); // TODO(T42695): Support optional weight parameter ERROR_ON_MSG( !isNone(weight), "Parameter \"weight\" is unsupported for aten::cross_entropy_loss"); auto *reduction = node->input(3); auto *ignore_index = node->input(4); auto log_softmax_handler = getHandler(c10::aten::log_softmax); auto *log_softmax = createHandlerOperation( graph, log_softmax_handler, {input, wrapInConstant1D(graph, 1)}); // logSoftmaxHandler loses shape information required by nllLossNdHandler, // so we need to set the type to that of the input, as the type will be the // same log_softmax->output()->setType(input->type()); return createHandlerOperation( graph, nllLossNdHandler, {log_softmax->output(), target, weight, reduction, ignore_index}); } torch::jit::Node *klDivHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::kl_div(Tensor self, Tensor target, int reduction, bool log_target) // Input torch::jit::Value *x = node->input(0); // Target torch::jit::Value *y = node->input(1); std::int64_t reduction = constantToLong(node->input(2)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); // Whether the target is passed as log-probabilities bool log_target = constantToBool(node->input(3)->node()); // log(y) torch::jit::Value *log_y; // Handle log-space targets at this stage if (log_target) { log_y = y; y = createExp(graph, {y})->output(); } else { log_y = createLog(graph, {y})->output(); } // log(y) - x torch::jit::Node *log_y_minus_x = createSub(graph, {log_y, x}); // y(log(y) - x) torch::jit::Node *y_log_y_minus_x = createMul(graph, {y, log_y_minus_x->output()}); // Handle any log(y) where y<=0 from earlier torch::jit::Node *zeros = createConstantFloatLike(graph, y, {0}, {}); torch::jit::Node *mask = createGreater(graph, {y, zeros->output()}); torch::jit::Node *final_node = createWhere( graph, 
{mask->output(), y_log_y_minus_x->output(), zeros->output()}); return createIdentityloss(graph, {final_node->output()}, reduction); } torch::jit::Node *poissonNllLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::poisson_nll_loss(Tensor input, Tensor target, bool log_input, // bool full, float eps, int reduction) // Input torch::jit::Value *x = node->input(0); // Target torch::jit::Value *y = node->input(1); // Whether the input is passed as log-probabilities bool log_input = constantToBool(node->input(2)->node()); // Whether to compute full loss using Stirling approximation bool full = constantToBool(node->input(3)->node()); // Added to avoid log(0) when log_input == false torch::jit::Value *epsilon = node->input(4); std::int64_t reduction = constantToLong(node->input(5)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); // log(x) torch::jit::Value *log_x; // Handle log-space inputs at this stage if (log_input) { log_x = x; x = createExp(graph, {x})->output(); } else { torch::jit::Value *x_plus_eps = createAdd(graph, {x, epsilon})->output(); log_x = createLog(graph, {x_plus_eps})->output(); } // y log(x) torch::jit::Node *y_mul_log_x = createMul(graph, {y, log_x}); // x - y log(x) torch::jit::Node *final_node = createSub(graph, {x, y_mul_log_x->output()}); // Stirling approximation term = y log(y) -y + 0.5 log(2*PI*y) if (full) { // log(y) torch::jit::Node *log_y = createLog(graph, {y}); // y log(y) torch::jit::Node *y_mul_log_y = createMul(graph, {y, log_y->output()}); // y log(y) - y torch::jit::Node *minus_y = createSub(graph, {y_mul_log_y->output(), y}); // 2*PI torch::jit::Node *two_pi = createConstantFloatLike(graph, x, {2 * M_PI}, {}); // 2*PI*y torch::jit::Node *two_pi_y = createMul(graph, {two_pi->output(), y}); // log(2*PI*y) torch::jit::Node *log_two_pi_y = createLog(graph, {two_pi_y->output()}); // 0.5 torch::jit::Node *half = createConstantFloatLike(graph, x, {0.5}, {}); // 0.5 log(2*PI*y) 
torch::jit::Node *mul_half = createMul(graph, {half->output(), log_two_pi_y->output()}); // y log(y) - y + 0.5 log(2*PI*y) torch::jit::Node *add = createAdd(graph, {minus_y->output(), mul_half->output()}); // Approximation values only added for target values > 1 std::vector shape = shapeFromTensor(y); torch::jit::Node *ones = createConstantFloatLike(graph, x, {1}, shape); torch::jit::Node *mask = createGreater(graph, {y, ones->output()}); torch::jit::Node *zeros = createConstantFloatLike(graph, x, {0}, shape); torch::jit::Node *masked_fill = createWhere(graph, {mask->output(), add->output(), zeros->output()}); // x - y log(x) + y log(y) - y + 0.5 log(2*PI*y) final_node = createAdd(graph, {final_node->output(), masked_fill->output()}); } return createIdentityloss(graph, {final_node->output()}, reduction); } torch::jit::Node *bceWithLogitsHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::binary_cross_entropy_with_logits(Tensor input, Tensor target, // Tensor? weight, Tensor? 
pos_weight, // int reduction) // Input torch::jit::Value *x = node->input(0); // Target torch::jit::Value *y = node->input(1); // Weight torch::jit::Value *w = node->input(2); // Weight of positive examples torch::jit::Value *pos_w = node->input(3); std::int64_t reduction = constantToLong(node->input(4)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); // -x torch::jit::Node *loss = createNeg(graph, {x}); // 0 torch::jit::Node *zeros = createConstantFloatLike(graph, x, {0}, {}); // m = max(-x, 0) torch::jit::Node *m = createMax(graph, {loss->output(), zeros->output()}); // -x - m loss = createSub(graph, {loss->output(), m->output()}); // exp(-x - m) loss = createExp(graph, {loss->output()}); // -m torch::jit::Node *neg_m = createNeg(graph, {m->output()}); // exp(-m) torch::jit::Node *exp_neg_m = createExp(graph, {neg_m->output()}); // exp(-m) + exp(-x - m) loss = createAdd(graph, {exp_neg_m->output(), loss->output()}); // log(exp(-m) + exp(-x - m)) loss = createLog(graph, {loss->output()}); // m + log(exp(-m) + exp(-x - m)) loss = createAdd(graph, {m->output(), loss->output()}); // 1 torch::jit::Node *ones = createConstantFloatLike(graph, x, {1}, {}); // if pos_weight is specified if (!isNone(pos_w)) { // p - 1 torch::jit::Node *p_minus_one = createSub(graph, {pos_w, ones->output()}); // (p - 1) y torch::jit::Node *p_minus_one_mul_y = createMul(graph, {p_minus_one->output(), y}); // l_p = (p - 1) y + 1 torch::jit::Node *l_p = createAdd(graph, {p_minus_one_mul_y->output(), ones->output()}); // l_p (m + log(exp(-m) + exp(-x - m))) loss = createMul(graph, {l_p->output(), loss->output()}); } // (1 - y) torch::jit::Node *one_minus_y = createSub(graph, {ones->output(), y}); // (1 - y) x torch::jit::Node *mul_x = createMul(graph, {one_minus_y->output(), x}); // (1 - y) x + l_p (m + log(exp(-m) + exp(-x - m))) loss = createAdd(graph, {mul_x->output(), loss->output()}); // if weight is specified if (!isNone(w)) { // w [(1 - y) x + 
l_p (m + log(exp(-m) + exp(-x - m)))] loss = createMul(graph, {w, loss->output()}); } return createIdentityloss(graph, {loss->output()}, reduction); } // TODO(T30688): Unsupported since the PyTorch implementation doesn't // currently use this aten function torch::jit::Node *multiLabelSoftMarginLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::multilabel_soft_margin_loss(Tensor input, Tensor target, // Tensor? weight, int reduction) // Input torch::jit::Value *x = node->input(0); // Target torch::jit::Value *y = node->input(1); // Weight torch::jit::Value *w = node->input(2); std::int64_t reduction = constantToLong(node->input(3)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); auto log_sigmoid_handler = getHandler(c10::aten::log_sigmoid); // -x torch::jit::Node *loss = createNeg(graph, {x}); // log(sigmoid(-x)) loss = createHandlerOperation(graph, log_sigmoid_handler, {loss->output()}); // 1 torch::jit::Node *ones = createConstantFloatLike(graph, x, {1}, {}); // 1 - y torch::jit::Node *one_minus_y = createSub(graph, {ones->output(), y}); // (1 - y) log(sigmoid(-x)) loss = createMul(graph, {one_minus_y->output(), loss->output()}); // log(sigmoid(x)) torch::jit::Node *log_sig_x = createHandlerOperation(graph, log_sigmoid_handler, {x}); // y log(sigmoid(x)) torch::jit::Node *y_mul_log_sig_x = createMul(graph, {y, log_sig_x->output()}); // y log(sigmoid(x)) + (1 - y) log(sigmoid(-x)) loss = createAdd(graph, {y_mul_log_sig_x->output(), loss->output()}); // -(y log(sigmoid(x)) + (1 - y) log(sigmoid(-x))) loss = createNeg(graph, {loss->output()}); // if weight is specified if (!isNone(w)) { // -w (y log(sigmoid(x)) + (1 - y) log(sigmoid(-x))) loss = createMul(graph, {w, loss->output()}); } return createIdentityloss(graph, {loss->output()}, reduction); } torch::jit::Node *cosineEmbeddingLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::cosine_embedding_loss(Tensor input1, Tensor input2, 
Tensor target, // float margin, int reduction) // Input 1 torch::jit::Value *x1 = node->input(0); // Input 2 torch::jit::Value *x2 = node->input(1); // Target torch::jit::Value *y = node->input(2); // Margin torch::jit::Value *margin = node->input(3); std::int64_t reduction = constantToLong(node->input(4)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); // Epsilon torch::jit::Value *epsilon = createConstantFloatLike(graph, x1, {1e-12}, {})->output(); // x1 * x2 torch::jit::Node *x1_mul_x2 = createMul(graph, {x1, x2}); // sum(x1 * x2) torch::jit::Node *sum_x1_mul_x2 = createReducesum(graph, {x1_mul_x2->output()}, {1}, 0); // sum_sqr(x1) torch::jit::Node *sum_sqr_x1 = createReducesumsquare(graph, {x1}, {1}, 0); // sq1 = sum_sqr(x1) + eps torch::jit::Node *sum_sqr_x1_plus_eps = createAdd(graph, {sum_sqr_x1->output(), epsilon}); // sum_sqr(x2) torch::jit::Node *sum_sqr_x2 = createReducesumsquare(graph, {x2}, {1}, 0); // sq2 = sum_sqr(x2) + eps torch::jit::Node *sum_sqr_x2_plus_eps = createAdd(graph, {sum_sqr_x2->output(), epsilon}); // sq1 * sq1 torch::jit::Node *sq1_mul_sq2 = createMul( graph, {sum_sqr_x1_plus_eps->output(), sum_sqr_x2_plus_eps->output()}); // sqrt(sq1 * sq2) torch::jit::Node *sqrt_sq1_mul_sq2 = createSqrt(graph, {sq1_mul_sq2->output()}); // cos_sim(x1, x2) torch::jit::Node *cos_sim = createDiv(graph, {sum_x1_mul_x2->output(), sqrt_sq1_mul_sq2->output()}); // 1 torch::jit::Node *ones = createConstantFloatLike(graph, x1, {1}, {}); // 1 - cos_sim(x1, x2) torch::jit::Node *one_minus_cos_sim = createSub(graph, {ones->output(), cos_sim->output()}); // cos_sim(x1, x2) - margin torch::jit::Node *cos_sim_minus_margin = createSub(graph, {cos_sim->output(), margin}); // 0 torch::jit::Node *zeros = createConstantFloatLike(graph, x1, {0}, {}); // max(0, cos_sim(x1, x2) - margin) torch::jit::Node *max_zero_cos_sim_minus_margin = createMax(graph, {zeros->output(), cos_sim_minus_margin->output()}); // -1 torch::jit::Node 
*neg_ones = createConstantInt(graph, {-1}, {}); // if y = 1 torch::jit::Node *ones_mask = createEqual(graph, {y, ones->output()}); // if y = -1 torch::jit::Node *neg_ones_mask = createEqual(graph, {y, neg_ones->output()}); // l = 1 - cos(x1, x2) if y = 1 torch::jit::Node *ones_masked_fill = createWhere(graph, {ones_mask->output(), one_minus_cos_sim->output(), zeros->output()}); // l = max(0, cos(x1, x2) - margin) if y = -1 torch::jit::Node *neg_ones_masked_fill = createWhere( graph, {neg_ones_mask->output(), max_zero_cos_sim_minus_margin->output(), zeros->output()}); torch::jit::Node *loss = createAdd( graph, {ones_masked_fill->output(), neg_ones_masked_fill->output()}); return createIdentityloss(graph, {loss->output()}, reduction); } torch::jit::Node *tripletMarginLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, // float margin, float p, float eps, bool swap, int // reduction) // Anchor torch::jit::Value *a = node->input(0); // Positive torch::jit::Value *pos = node->input(1); // Negative torch::jit::Value *neg = node->input(2); // Margin torch::jit::Value *margin = node->input(3); // Norm degree for pairwise distance torch::jit::Value *p = node->input(4); // Small value to avoid division by zero torch::jit::Value *eps = node->input(5); // Swap bool swap = constantToBool(node->input(6)->node()); // keepdim = false torch::jit::Value *keepdim = createConstantInt(graph, {0}, {})->output(); std::int64_t reduction = constantToLong(node->input(7)->node()); // Convert to popart reduce values reduction = convertReduceToPopart(reduction); // pairwiseDistanceHandler auto pairwise_dist_handler = getHandler(c10::aten::pairwise_distance); // d(a, pos) torch::jit::Node *loss = createHandlerOperation(graph, pairwise_dist_handler, {a, pos, p, eps, keepdim}); // d(a, neg) torch::jit::Node *dist_neg = createHandlerOperation( graph, pairwise_dist_handler, {a, neg, p, eps, keepdim}); if 
(swap) { torch::jit::Node *dist_swap = createHandlerOperation( graph, pairwise_dist_handler, {pos, neg, p, eps, keepdim}); // d(a, neg) = min(d(a, neg), d(pos, neg)) dist_neg = createMin(graph, {dist_neg->output(), dist_swap->output()}); } // d(a, pos) - d(a, neg) loss = createSub(graph, {loss->output(), dist_neg->output()}); // d(a, pos) - d(a, neg) + margin loss = createAdd(graph, {loss->output(), margin}); torch::jit::Node *zeros = createConstantFloatLike(graph, a, {0}, {}); // max(d(a, pos) - d(a, neg) + margin, 0) loss = createMax(graph, {loss->output(), zeros->output()}); return createIdentityloss(graph, {loss->output()}, reduction); } torch::jit::Node *ctcLossHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *log_probs = node->input(0); auto *targets = node->input(1); auto *input_lengths = node->input(2); auto *target_lengths = node->input(3); auto blank = constantToInt(node->input(4)->node()); auto reduction = constantToLong(node->input(5)->node()); auto zero_inf = constantToBool(node->input(6)->node()); ERROR_ON_MSG(reduction == 0, "CTCLoss with reduction=\"none\" is currently not supported"); targets = createCast(graph, {targets}, "UINT32")->output(); if (input_lengths->type()->kind() == c10::TypeKind::TensorType) { // aten::ctc_loss.Tensor input_lengths = createCast(graph, {input_lengths}, "UINT32")->output(); } else { // aten::ctc_loss.IntList: convert to tensor for popart::_ctcloss const auto values = constantToLongVec(input_lengths->node()); const std::int64_t shape = values.size(); input_lengths = createConstantInt(graph, values, {shape})->output(); } if (target_lengths->type()->kind() == c10::TypeKind::TensorType) { // aten::ctc_loss.Tensor target_lengths = createCast(graph, {target_lengths}, "UINT32")->output(); } else { // aten::ctc_loss.IntList: convert to tensor for popart::_ctcloss const auto values = constantToLongVec(target_lengths->node()); const std::int64_t shape = values.size(); target_lengths = createConstantInt(graph, 
values, {shape})->output(); } reduction = convertReduceToPopart(reduction); auto *loss = create_ctcloss(graph, {log_probs, targets, input_lengths, target_lengths}, reduction, blank, "UNDEFINED", zero_inf); return createIdentityloss(graph, {loss->output()}, reduction); } torch::jit::Node *ctcbeamsearchdecoderHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *log_probs = node->input(0); auto *lengths = node->input(1); auto blank = constantToInt(node->input(2)->node()); auto width = constantToInt(node->input(3)->node()); auto top_paths = constantToInt(node->input(4)->node()); lengths = createCast(graph, {lengths}, "UINT32")->output(); auto *decoder = createCtcbeamsearchdecoder(graph, {log_probs, lengths}, blank, width, top_paths); decoder->addOutput(); decoder->addOutput(); node->output(0)->replaceAllUsesWith(decoder->output(0)); node->output(1)->replaceAllUsesWith(decoder->output(1)); node->output(2)->replaceAllUsesWith(decoder->output(2)); markNodeForDeletion(node); return decoder; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::nll_loss2d, nllLossNdHandler); registerHandler(c10::aten::nll_loss2d_forward, nllLossNdHandler); registerHandler(c10::aten::nll_loss_nd, nllLossNdHandler); registerHandler(c10::aten::nll_loss_forward, nllLossNdHandler); registerHandler(c10::aten::binary_cross_entropy, binaryCrossEntropyHandler); registerHandler(c10::aten::kl_div, klDivHandler); registerHandler(c10::aten::poisson_nll_loss, poissonNllLossHandler); registerHandler(c10::aten::binary_cross_entropy_with_logits, bceWithLogitsHandler); registerHandler(c10::aten::multilabel_soft_margin_loss, multiLabelSoftMarginLossHandler); registerHandler(c10::aten::cosine_embedding_loss, cosineEmbeddingLossHandler); registerHandler(c10::aten::triplet_margin_loss, tripletMarginLossHandler); registerHandler(c10::aten::ctc_loss, ctcLossHandler); registerHandler(symbols::poptorch::ctc_beam_search_decoder, 
ctcbeamsearchdecoderHandler); registerHandler(c10::aten::cross_entropy_loss, crossEntropyLossHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/NormalizationOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/ImplicitCasting.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace { void initializeParamConstant(torch::jit::Graph *graph, torch::jit::Value *input, torch::jit::Value **param, float value, const std::vector &shape, const std::string &norm_name, const std::string &input_name, bool always_f32 = false) { c10::ScalarType const scalar_type = *input->type()->expect()->scalarType(); switch (scalar_type) { case c10::ScalarType::Int: { *param = createConstantInt(graph, {static_cast(value)}, shape) ->output(); break; } case c10::ScalarType::Half: case c10::ScalarType::Float: { if (always_f32) { *param = createConstantFloat32(graph, {value}, shape)->output(); } else { *param = createConstantFloatLike(graph, input, {value}, shape)->output(); } break; } default: ERROR(norm_name << " input \"" << input_name << "\"" << " of type " << c10::toString(scalar_type) << " not supported"); } } // Return true if parameters are initialised by this function, otherwise return // false bool maybeInitializeAffineParamConstants(torch::jit::Graph *graph, torch::jit::Value *input, torch::jit::Value **weight, torch::jit::Value **bias, const std::vector &shape, const std::string &norm_name) { // Either both should be defined, or neither ERROR_ON(isNone(*weight) != isNone(*bias)); if (!isNone(*weight)) { return false; } initializeParamConstant(graph, input, weight, 1, shape, norm_name, "weight"); initializeParamConstant(graph, input, 
bias, 0, shape, norm_name, "bias"); return true; } // Ensures running_mean and running_var tensors by creating constants if they // are not set (None) The running_mean and running_var may be none e.g. if // track_running_stats is set to False for the relevant PyTorch BatchNorm layer. // To satisfy popart/onnx, create a zero input for running_mean and all ones for // running_var void maybeInitializeRunningParamConstants( torch::jit::Graph *graph, torch::jit::Value *input, torch::jit::Value **running_mean, torch::jit::Value **running_var, const std::vector &shape) { // Either both should be defined, or neither ERROR_ON(isNone(*running_mean) != isNone(*running_var)); if (!isNone(*running_mean)) { return; } std::string const norm_name = "BatchNorm"; const bool always_f32 = false; initializeParamConstant(graph, input, running_mean, 0, shape, norm_name, "running_mean", always_f32); initializeParamConstant(graph, input, running_var, 1, shape, norm_name, "running_var", always_f32); } torch::jit::Node *batchNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? // running_mean, Tensor? running_var, bool training, float momentum, float // eps, bool cudnn_enabled) -> Tensor // aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? // running_mean, Tensor? running_var, bool training, float momentum, float // eps) -> (Tensor, Tensor, Tensor) // Input is value at 0th position. 
torch::jit::Value *input = node->input(0); auto input_shape = shapeFromTensor(input); torch::jit::Value *weight = node->input(1); torch::jit::Value *bias = node->input(2); castWeightAndBias(graph, input, weight, bias); torch::jit::Value *running_mean = node->input(3); torch::jit::Value *running_var = node->input(4); const float momentum = constantToFloat(node->input(6)->node()); const float epsilon = constantToFloat(node->input(7)->node()); const bool training = constantToBool(node->input(5)->node()); const bool three_outputs = (node->kind() == c10::aten::native_batch_norm); const std::vector param_shape{input_shape[1]}; maybeInitializeAffineParamConstants(graph, input, &weight, &bias, param_shape, "BatchNorm"); // Use initialised constants if running_mean and running_var are none maybeInitializeRunningParamConstants(graph, input, &running_mean, &running_var, param_shape); // PyTorch supports an input size of (N, C, *) but PopART requires the spatial // dimension, so we must ensure an input size of (N, C, L, *) if (input_shape.size() == 2) { input = createUnsqueeze(graph, {input}, {2})->output(); } // To indicate training, for BatchNormalization-9, use num_outputs = 5 // From ONNX // Output case #1: Y, mean, var, saved_mean, saved_var (training mode) // Output case #2: Y (test mode) // Popart supports this with "if (output->n() > 1)" auto *batch_norm = createBatchnormalization( graph, {input, weight, bias, running_mean, running_var}, training ? 5 : 1, epsilon, 1.0f - momentum, training && three_outputs ? 3 : 1); // If the input size was of rank 2, we need to squeeze out the added dim if (input_shape.size() == 2) { batch_norm = createSqueeze(graph, {batch_norm->output(0)}, {2}); } return batch_norm; } torch::jit::Node *layerNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::layer_norm(Tensor input,int[] normalized_shape, Tensor? weight, // Tensor? 
bias, float eps, bool cudnn_enable) -> Tensor // aten::native_layer_norm(Tensor input, int[] normalized_shape, // Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor) // NB return tensors match PopART // Tensor to normalise. torch::jit::Value *input = node->input(0); std::vector normalized_shape = constantToLongVec(node->input(1)->node()); // Weight to multiply. torch::jit::Value *gamma = node->input(2); // Bias to add. torch::jit::Value *beta = node->input(3); castWeightAndBias(graph, input, gamma, beta); auto numel_affine = std::accumulate(normalized_shape.begin(), normalized_shape.end(), 1, std::multiplies{}); const bool initialized = maybeInitializeAffineParamConstants( graph, input, &gamma, &beta, {numel_affine}, "LayerNorm"); if (!initialized) { // GroupNorm takes per-channel affine parameters whereas LayerNorm takes // elementwise affine parameters. Therefore we first need to reshape such // that the affine parameters are "per-channel" which in the case of // LayerNorm is equivalent to flattening them gamma = createReshape(graph, gamma, {static_cast(numel_affine)}) ->output(); beta = createReshape(graph, beta, {static_cast(numel_affine)}) ->output(); } const float epsilon = constantToFloat(node->input(4)->node()); // Pytorch normalizes across arbitrary number of dimensions from the end. // We flatten into a [M, N] array and normalize the N. // (In the event of using native_layer_norm, there will be three outputs. // Use only the first.) const std::vector output_shape = shapeFromTensor(node->output(0)); const std::vector input_shape = shapeFromTensor(input); const std::int64_t axis = input_shape.size() - normalized_shape.size(); // Flatten into [M, N] torch::jit::Node *flatten = createFlatten(graph, {input}, axis); // Normalize. torch::jit::Node *normalize = createGroupnormalization( graph, {flatten->output(), gamma, beta}, 1, epsilon); // Perform the reshape. 
return createReshape(graph, normalize->output(), output_shape); } // This handler ensures that the input to popart is 4-dimensional torch::jit::Node *groupNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? // bias, float eps, bool cudnn_enabled) torch::jit::Value *input = node->input(0); std::int64_t const num_groups = constantToLong(node->input(1)->node()); // Weight to multiply torch::jit::Value *gamma = node->input(2); // Bias to add torch::jit::Value *beta = node->input(3); castWeightAndBias(graph, input, gamma, beta); auto num_channels = shapeFromTensor(input)[1]; maybeInitializeAffineParamConstants(graph, input, &gamma, &beta, {num_channels}, "GroupNorm"); const float epsilon = constantToFloat(node->input(4)->node()); return createGroupnormalization(graph, {input, gamma, beta}, num_groups, epsilon); } // aten::native_group_norm has a different signature to aten::group_norm torch::jit::Node *nativeGroupNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::native_group_norm(Tensor input, Tensor? weight, Tensor? 
bias, int N, // int C, int HxW, int group, float eps) -> (Tensor, Tensor, Tensor) // Returns are (result, mean, inv_std_dev) which matches PopTorch torch::jit::Value *input = node->input(0); // Weight to multiply torch::jit::Value *gamma = node->input(1); // Bias to add torch::jit::Value *beta = node->input(2); castWeightAndBias(graph, input, gamma, beta); auto num_channels = shapeFromTensor(input)[1]; maybeInitializeAffineParamConstants(graph, input, &gamma, &beta, {num_channels}, "GroupNorm"); // N, C and HxW are redundant given that the input size must be known for // IPU, but provide a useful check auto input_shape = shapeFromTensor(input); ERROR_ON(input_shape[0] != constantToLong(node->input(3)->node())); ERROR_ON(input_shape[1] != constantToLong(node->input(4)->node())); auto hx_w = std::accumulate(input_shape.begin() + 2, input_shape.end(), static_cast(1), std::multiplies()); ERROR_ON(hx_w != constantToLong(node->input(5)->node())); std::int64_t const num_groups = constantToLong(node->input(6)->node()); const float epsilon = constantToFloat(node->input(7)->node()); return createGroupnormalization(graph, {input, gamma, beta}, num_groups, epsilon); } torch::jit::Node *instanceNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? // running_mean, Tensor? 
running_var, bool // use_input_stats, float momentum, float eps, bool // cudnn_enabled) // Tensor to normalise // Input: (N, C, L) InstanceNorm1d // (N, C, H, W) InstanceNorm2d // (N, C, D, H, W) InstanceNorm3d torch::jit::Value *input = node->input(0); // Weight to multiply torch::jit::Value *gamma = node->input(1); // Bias to add torch::jit::Value *beta = node->input(2); castWeightAndBias(graph, input, gamma, beta); std::int64_t const num_channels = shapeFromTensor(input)[1]; maybeInitializeAffineParamConstants(graph, input, &gamma, &beta, {num_channels}, "InstanceNorm"); // Group normalization does not currently allow passing a momentum value, // nor the running mean or running variance const float epsilon = constantToFloat(node->input(7)->node()); // Normalize per channel C, so use Group normalization with C groups return createGroupnormalization(graph, {input, gamma, beta}, num_channels, epsilon); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::batch_norm, batchNormHandler); registerHandler(c10::aten::native_batch_norm, batchNormHandler); registerHandler(c10::aten::layer_norm, layerNormHandler); registerHandler(c10::aten::native_layer_norm, layerNormHandler); registerHandler(c10::aten::group_norm, groupNormHandler); registerHandler(c10::aten::native_group_norm, nativeGroupNormHandler); registerHandler(c10::aten::instance_norm, instanceNormHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/OtherOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "EinsumOp.hpp" #include "PopartCanonicalizationUtils.hpp" #include "ScatterReduction.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/PopartCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include namespace poptorch { namespace { torch::jit::Node *bucketizeHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::bucketize.Tensor(Tensor self, Tensor boundaries, *, // bool out_int32=False, bool right=False) -> Tensor const auto args = poptorch::promoteTensors(graph, node->input(0), node->input(1)); const bool right = constantToBool(node->input(3)->node()); return createBucketize(graph, args, right); } torch::jit::Node *bincountHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::bincount(Tensor self, Tensor? weights=None, int minlength=0) // -> Tensor auto *input = node->input(0); auto *const weights_param = node->input(1); auto *const minlength = node->input(2); const int64_t axis_size = constantToLong(minlength->node()); const auto weights_length = shapeFromTensor(input).front(); auto *const weights = isNone(weights_param) ? 
createConstantInt(graph, std::vector(weights_length, 1), {weights_length}) ->output() : weights_param; if (getNodeScalarType(input) != c10::kInt) { input = createCast(graph, createFloor(graph, {input})->output(), c10::kInt) ->output(); } auto *const condition = createLess(graph, {input, minlength})->output(); auto *const max_index = createConstantInt(graph, {weights_length - 1}, {1})->output(); input = createWhere(graph, {condition, input, max_index})->output(); static constexpr bool enable_index_broadcast = false; static constexpr int64_t reduction_type = static_cast(ScatterReduction::Sum); static constexpr int64_t axis = 0; return createScatterreduce(graph, {weights, input}, axis_size, axis, enable_index_broadcast, reduction_type); } torch::jit::Node *einsumHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::einsum(string equation, Tensor[] tensors) -> Tensor // Einstein summation convention equation const std::string eq = constantToString(node->input(0)->node()); // List of inputs to perform the operation on const std::vector tensors = handleTensorList(node->input(1)->node()); const std::vector output_shape = shapeFromTensor(node->output()); EinsumOp einsum(eq, tensors); return einsum.create(graph, output_shape); } torch::jit::Node *meshgridHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::meshgrid(Tensor[] tensors) -> Tensor[] const std::vector tensors = handleTensorList(node->input(0)->node()); std::vector expand_shape; expand_shape.reserve(tensors.size()); for (torch::jit::Value *tensor : tensors) { // Each tensor is 1D so the shape is just the first dim expand_shape.push_back(shapeFromTensor(tensor)[0]); } std::vector grids; for (std::size_t i = 0; i < tensors.size(); i++) { std::vector shape(tensors.size(), 1); shape[i] = -1; // Reshape 1D tensor to rank N, N = number of tensors, such that // all but the ith dimension is a singleton torch::jit::Node *reshaped = createReshape(graph, tensors[i], shape); // Expand over the 
dimensions of all other tensors torch::jit::Node *expanded = createExpand(graph, {reshaped->output(), intVectorToIrConstant(graph, expand_shape)}); grids.push_back(expanded->output()); } return createAndInsertNode(graph, at::prim::ListConstruct, grids); } torch::jit::Node *cartesianProdHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::cartesian_prod(Tensor[] tensors) -> Tensor const std::vector tensors = handleTensorList(node->input(0)->node()); if (tensors.size() == 1) { return tensors[0]->node(); } auto meshgrid_handler = getHandler(c10::aten::meshgrid); auto stack_handler = getHandler(c10::aten::stack); torch::jit::Node *grids = createHandlerOperation(graph, meshgridHandler, {node->input(0)}); std::vector grids_vector = handleTensorList(grids); for (torch::jit::Value *&grid : grids_vector) { // Flatten into 1 x N torch::jit::Node *flatten = createFlatten(graph, {grid}, 0); // Squeeze the first dimension flatten = createSqueeze(graph, {flatten->output()}, {0}); grid = flatten->output(); } torch::jit::Node *grid_list = createAndInsertNode(graph, at::prim::ListConstruct, grids_vector); // Stack 1D tensors along dimension 1 return createHandlerOperation( graph, stack_handler, {grid_list->output(), wrapInConstant1D(graph, 1)}); } torch::jit::Node *tensordotHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::tensordot(Tensor self, Tensor other, int[] dims_self, // int[] dims_other) -> Tensor torch::jit::Value *x1 = node->input(0); torch::jit::Value *x2 = node->input(1); std::vector rdims_x1 = constantToLongVec(node->input(2)->node()); const std::vector rdims_x2 = constantToLongVec(node->input(3)->node()); // rdims_prod (default = 1 with no reduction) std::int64_t rdims_prod = 1; const std::vector shape_x1 = shapeFromTensor(x1); const std::vector shape_x2 = shapeFromTensor(x2); // Original permutation std::vector p1 = shape_x1; std::iota(p1.begin(), p1.end(), 0); std::vector p2 = shape_x2; std::iota(p2.begin(), p2.end(), 0); const 
std::size_t n_dims_x1 = p1.size(); const std::size_t n_dims_x2 = p2.size(); const std::size_t n_rdims = rdims_x1.size(); // Negative (relative) indexing -> absolute indexing for (std::int64_t &rdim : rdims_x1) { if (rdim < 0) { rdim += n_dims_x1; } } std::vector rdims_x1_bs(n_dims_x1); std::vector rdims_x2_bs(n_dims_x2); for (std::size_t i = 0; i < n_rdims; i++) { rdims_x1_bs[rdims_x1[i]] = true; rdims_x2_bs[rdims_x2[i]] = true; // prod(rdims_x1) == prod(rdims_x2) so just use x1 rdims_prod *= shape_x1[rdims_x1[i]]; } // Permutes x according to existing permutation vector p and bitset bs. If // should_partition_front == true, elements of p are moved to the front // if the corresponding bool in bs == true. Otherwise, they are moved to // the back. The relative order of other elements must not change. const auto fn_partition_permute = [&](torch::jit::Value *x, auto &p, const auto &bs, bool should_partition_front) { std::stable_partition(p.begin(), p.end(), [&](std::int64_t n) { return bs[n] == should_partition_front; }); return createTranspose(graph, {x}, p); }; // Permute x1 so that rdims_x1 are the last dims torch::jit::Node *p_x1 = fn_partition_permute(x1, p1, rdims_x1_bs, false); // Reshape to (-1, rdims_prod(rdims)) torch::jit::Node *p_x1_mat = createReshape(graph, p_x1->output(), {-1, rdims_prod}); // Permute x2 so that rdims_x2 are the first dims torch::jit::Node *p_x2 = fn_partition_permute(x2, p2, rdims_x2_bs, true); // Reshape to (rdims_prod(rdims), -1) torch::jit::Node *p_x2_mat = createReshape(graph, p_x2->output(), {rdims_prod, -1}); // Matmul -> (unreduced_x1, unreduced_x2) torch::jit::Node *mm = createMatmul(graph, {p_x1_mat->output(), p_x2_mat->output()}); std::vector new_shape; new_shape.reserve(n_dims_x1 + n_dims_x2); for (std::size_t i = 0; i < n_dims_x1; i++) { if (!rdims_x1_bs[i]) { new_shape.push_back(shape_x1[i]); } } for (std::size_t i = 0; i < n_dims_x2; i++) { if (!rdims_x2_bs[i]) { new_shape.push_back(shape_x2[i]); } } // Restore flattened 
// dims
  return createReshape(graph, mm->output(), new_shape);
}

// Returns the value of the node's "enable_index_broadcast" attribute, or
// false when the node does not carry that attribute.
bool isIndexBroadcastEnabled(torch::jit::Node *node) {
  static const auto bcast_attr = c10::Symbol::attr("enable_index_broadcast");
  return node->hasAttribute(bcast_attr) ? static_cast(node->i(bcast_attr))
                                        : false;
}

// Canonicalises scatter_add as a sum scatter-reduce. Inputs are read as
// (0: tensor to scatter into, 1: dim, 2: index, 3: src). When the tensor
// scattered into is a constant consisting only of zeros it is dropped and
// only src/index are passed to the reduction.
torch::jit::Node *scatterAddHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  static constexpr std::int32_t sum_reduce =
      static_cast(ScatterReduction::Sum);

  auto *output = node->input(0);
  auto *index = node->input(2);
  auto *src = node->input(3);
  const auto src_type = src->type()->expect();
  const auto axis = handleDimensionParam(node->input(1), src_type);
  // Size of the output along the scatter axis.
  const auto shape = shapeFromTensor(node->output());
  const auto axissize = shape.at(axis);
  const auto enable_index_broadcast = isIndexBroadcastEnabled(node);

  if (isTensorConstant(output->node())) {
    // output may have been generated by calling zeros(...) and at this point
    // in the canonicalization the node is represented as a tensor constant.
    auto out_tensor = getNodeTensorAttrValue(output->node());
    const auto scalar_zero = at::zeros(1, out_tensor.dtype());
    const bool all_zeros = at::all(out_tensor.eq(scalar_zero)).item().toBool();

    if (all_zeros) {
      logging::trace("Removing zeros output to scatter_add: {}",
                     nodeToString(output->node()));
      markNodeForDeletion(output->node());
      return createScatterreduce(graph, {src, index}, axissize, axis,
                                 enable_index_broadcast, sum_reduce);
    }
  }

  return createScatterreduce(graph, {src, index, output}, axissize, axis,
                             enable_index_broadcast, sum_reduce);
}

// Builds the `mean` reduction out of sum scatter-reduces: one that counts
// how many elements land on each index and one that sums them, followed by a
// division. With include_self == false the entries of `self` selected by
// `index` are zeroed first and are not counted.
torch::jit::Node *
meanScatterReduceHandler(torch::jit::Graph *graph, torch::jit::Value *self,
                         torch::jit::Value *index, torch::jit::Value *src,
                         const std::int64_t axis, const std::int64_t axissize,
                         const bool include_self,
                         const bool enable_index_broadcast) {
  static constexpr int32_t sum_reduce = static_cast(ScatterReduction::Sum);

  auto *ones_self =
      createConstantFloatLike(graph, src, {1.0}, shapeFromTensor(self));
  auto *ones_src =
      createConstantFloatLike(graph, src, {1.0}, shapeFromTensor(src));

  torch::jit::Node *count;
  if (include_self) {
    // Count the number of elements reduced to each index.
    count = createScatterreduce(
        graph, {ones_src->output(), index, ones_self->output()}, axissize,
        axis, enable_index_broadcast, sum_reduce);
  } else {
    static constexpr int32_t none_reduce =
        static_cast(ScatterReduction::None);
    auto *zeros_src =
        createConstantFloatLike(graph, src, {0.0}, shapeFromTensor(src));
    // Tensor with zeros where the indices are updated and ones otherwise.
    auto *count_mask = createScatterreduce(
        graph, {zeros_src->output(), index, ones_self->output()}, axissize,
        axis, enable_index_broadcast, none_reduce);
    // Count the number of elements reduced to each index.
    count = createScatterreduce(
        graph, {ones_src->output(), index, count_mask->output()}, axissize,
        axis, enable_index_broadcast, sum_reduce);
    // Put zeros in those entries of the self tensor that are updated (i.e.
    // selected by `index`), so that the original values there don't impact
    // the reduction result (include_self=False).
    auto *masked_self =
        createScatterreduce(graph, {zeros_src->output(), index, self},
                            axissize, axis, enable_index_broadcast,
                            none_reduce);
    self = masked_self->output();
  }

  // Sum reduction and then division to calculate `mean`.
  auto *sr = createScatterreduce(graph, {src, index, self}, axissize, axis,
                                 enable_index_broadcast, sum_reduce);
  return createDiv(graph, {sr->output(), count->output()});
}

// Shared implementation behind scatterReduceHandler and indexReduceHandler;
// the callers only differ in the value passed for enable_index_broadcast.
torch::jit::Node *scatterReduce(torch::jit::Graph *graph,
                                torch::jit::Node *node,
                                const bool enable_index_broadcast) {
  // Signature for scatter_reduce
  // (Tensor self, int dim, Tensor index, Tensor src, string reduce,
  // bool include_self)
  auto *self = node->input(0);
  auto *dim = node->input(1);
  auto *index = node->input(2);
  auto *src = node->input(3);
  const auto reduce = getReductionMethod(node->input(4)->node());
  const bool include_self = constantToBool(node->input(5)->node());

  const auto src_type = src->type()->expect();
  const auto axis = handleDimensionParam(dim, src_type);
  // Size of the output along the scatter axis.
  const auto outshape = shapeFromTensor(node->output(0));
  const auto axissize = outshape.at(axis);

  if (reduce == static_cast(ScatterReduction::Mean)) {
    // `Mean` is decomposed as two scatter_reduce sums.
    return meanScatterReduceHandler(graph, self, index, src, axis, axissize,
                                    include_self, enable_index_broadcast);
  }

  if (!include_self) {
    // Mask those indices in `self` that are specified by `index`: they are
    // overwritten with the initial value of the reduction before reducing.
    auto *init = createConstantFloatLike(
        graph, src, {getReductionInitValue(reduce)}, shapeFromTensor(src));
    static constexpr std::int32_t none_reduce =
        static_cast(ScatterReduction::None);
    auto *masked_self =
        createScatterreduce(graph, {init->output(), index, self}, axissize,
                            axis, enable_index_broadcast, none_reduce);
    return createScatterreduce(graph, {src, index, masked_self->output()},
                               axissize, axis, enable_index_broadcast, reduce);
  }

  return createScatterreduce(graph, {src, index, self}, axissize, axis,
                             enable_index_broadcast, reduce);
}

// Handler for aten::scatter_reduce: index broadcasting follows the node's
// "enable_index_broadcast" attribute.
torch::jit::Node *scatterReduceHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  const bool enable_index_broadcast = isIndexBroadcastEnabled(node);
  return scatterReduce(graph, node, enable_index_broadcast);
}

torch::jit::Node *indexReduceHandler(torch::jit::Graph *graph,
                                     torch::jit::Node
*node) { static constexpr bool enable_index_broadcast = true; return scatterReduce(graph, node, enable_index_broadcast); } torch::jit::Node *weightNormHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::_weight_norm(Tensor v, Tensor g, int dim) -> Tensor auto *v = node->input(0); auto *g = node->input(1); const auto shape = shapeFromTensor(v); auto dim = constantToLong(node->input(2)->node()); // Correct negative indices // PyTorch handles dim -1 in a special way - it computes the // norm over all dimensions. We handle that case separately if (dim < -1) { dim += shape.size(); } std::vector axes(shape.size()); std::iota(axes.begin(), axes.end(), 0); // If we have the special case dim -1: We don't erase any // axes so that the norm is computed over all dimensions) if (dim != -1) { axes.erase(axes.begin() + dim); } std::vector axes_constants; axes_constants.reserve(axes.size()); for (auto d : axes) { axes_constants.push_back(wrapInConstant1D(graph, d)); } // tensorNormHandler expects ListConstruct for axes_constants torch::jit::Value *axes_list = createAndInsertNode(graph, c10::prim::ListConstruct, axes_constants) ->output(); // Order 2 norm auto *p = wrapInConstant1D(graph, 2); // Keep the normalised dims to enable broadcasting auto *keepdim = wrapInConstant1D(graph, 1); // tensorNormHandler auto norm_handler = getHandler(c10::aten::norm); // PyTorch defines the weight calculation as // w = g * v / norm(v) // This can be rewritten as // w = v * g / norm(v) // Which is slightly more efficient, since it doesn't require // expanding g to be broadcastable with v auto *norm_v = createHandlerOperation(graph, norm_handler, {v, p, axes_list, keepdim}); auto *scaled_v = createDiv(graph, {g, norm_v->output()}); return createMul(graph, {v, scaled_v->output()}); } torch::jit::Node *setAvailableMemoryHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // poptorch::set_available_memory(Tensor, float) -> Tensor auto *x = node->input(0); auto *y = 
node->input(1); const auto t0 = constantToFloat(y->node()); return createSetAvailableMemory(graph, x, t0); } torch::jit::Node *randintHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *out = node->output(0); const auto shape = shapeFromTensor(out); const auto scalar_type = getNodeScalarType(out); // Note: the popart range is closed whereas the pytorch range is expected to // be half open const auto high = constantToFloat(node->input(1)->node()) - 1.0f; const auto low = constantToFloat(node->input(0)->node()); auto *ints = createRandomUniform(graph, out, shape, high, low, c10::ScalarType::Int); return createCast(graph, ints->output(0), scalar_type); } torch::jit::Node *randomHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *out = node->input(0); const auto shape = shapeFromTensor(out); const auto scalar_type = getNodeScalarType(out); // Note: the popart range is closed whereas the pytorch range is expected to // be half open const auto high = constantToFloat(node->input(2)->node()) - 1.0f; const auto low = constantToFloat(node->input(1)->node()); auto *ints = createRandomUniform(graph, out, shape, high, low, c10::ScalarType::Int); return createCast(graph, ints->output(0), scalar_type); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::bincount, bincountHandler); registerHandler(c10::aten::bucketize, bucketizeHandler); registerHandler(c10::aten::einsum, einsumHandler); registerHandler(c10::aten::meshgrid, meshgridHandler); registerHandler(c10::aten::cartesian_prod, cartesianProdHandler); registerHandler(c10::aten::tensordot, tensordotHandler); registerHandler(c10::aten::scatter_add, scatterAddHandler); registerHandler(c10::aten::scatter_reduce, scatterReduceHandler); registerHandler(c10::aten::index_reduce, indexReduceHandler); registerHandler(c10::aten::_weight_norm, weightNormHandler); registerHandler(c10::aten::randint, randintHandler); 
registerHandler(c10::aten::random_, randomHandler); registerHandler(symbols::poptorch::set_available_memory, setAvailableMemoryHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/PoolingOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { torch::jit::Node *poolingHandler(torch::jit::Graph *graph, torch::jit::Node *node) { const torch::jit::Symbol kind = node->kind(); // aten::max_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] // padding, int[] dilation, bool ceil_mode) -> Tensor // // aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] // padding, bool ceil_mode, bool count_include_pad, // int? 
divisor_override) -> Tensor // aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] // stride=[], int[2] padding=[0, 0], int[2] dilation=[1, 1], bool // ceil_mode=False) -> (Tensor, Tensor) torch::jit::Value *x = node->input(0); const auto kernel_size = constantToLongVec(node->input(1)->node()); const auto stride = constantToLongVec(node->input(2)->node()); auto padding = constantToLongVec(node->input(3)->node()); auto shape = shapeFromTensor(x); bool reshape_after = false; // The torch input might be missing the batch dimension, so add one if // necessary // (C, *in) -> (1, C, *in) if (shape.size() != stride.size() + 2) { shape.push_back(1); // simple rotation to the right std::rotate(shape.rbegin(), shape.rbegin() + 1, shape.rend()); x = createReshape(graph, x, shape)->output(); reshape_after = true; } // If we reshape, the output shape will be (1, C, *out) but torch expects // (C, *out) const auto maybe_reshape_output = [&](torch::jit::Node *output) { if (reshape_after) { return createReshape(graph, output->output(), shapeFromTensor(node->output())); } return output; }; // Pytorch gives the padding as being the amount to pad in both // directions. Popart two arguments for each axis, the amount to pad in // each direction along that axis. In the form (Axis0Left, AxisNLeft..., // Axis0Right, AxisNRight) where left and right refer to the direction // along the axis to add zeros to. 
const std::size_t num_pads = padding.size(); for (std::size_t pad_index = 0; pad_index < num_pads; ++pad_index) { padding.push_back(padding[pad_index]); } const bool is_max_pool = kind == c10::aten::max_pool1d || kind == c10::aten::max_pool2d || kind == c10::aten::max_pool3d || kind == c10::aten::max_pool1d_with_indices || kind == c10::aten::max_pool2d_with_indices || kind == c10::aten::max_pool3d_with_indices; if (is_max_pool) { const auto dilations = constantToLongVec(node->input(4)->node()); const auto ceil_mode = constantToLong(node->input(5)->node()); auto *output = createMaxpool(graph, {x}, 1, kernel_size, ceil_mode, dilations, padding, 0, stride); return maybe_reshape_output(output); } // divisor_override is ignored for now due to not being supported directly in // popart. const auto ceil_mode = constantToLong(node->input(4)->node()); const bool count_include_pad = constantToBool(node->input(5)->node()); // count_include_pad isn't supported in PopART so we check and pad manually if // the average pool is supposed to include the padding in its average. if (count_include_pad) { x = createConstantPad(graph, x, padding, 0.f)->output(); // Ensure that padding isn't added twice. 
padding = {}; } // popart only supports float types for avgpool const auto input_type = getNodeScalarType(x); if (input_type == c10::kFloat) { auto *output = createAveragepool(graph, {x}, kernel_size, ceil_mode, 0, padding, stride); return maybe_reshape_output(output); } // all other types require casting via float x = createCast(graph, x, c10::kFloat)->output(); x = createAveragepool(graph, {x}, kernel_size, ceil_mode, 0, padding, stride) ->output(); auto *output = createCast(graph, x, input_type); return maybe_reshape_output(output); } torch::jit::Node *adaptivePoolingHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::adaptive_avg_pool1d(Tensor self, int[] output_size) -> Tensor // aten::adaptive_avg_pool2d(Tensor self, int[] output_size) -> Tensor // aten::adaptive_avg_pool3d(Tensor self, int[] output_size) -> Tensor torch::jit::Value *x = node->input(0); const std::vector output_shape = constantToLongVec(node->input(1)->node()); const std::size_t n_output_dims = output_shape.size(); const std::vector input_shape = shapeFromTensor(x); const std::size_t input_offset = input_shape.size() - n_output_dims; std::vector stride(n_output_dims); std::vector kernel_shape(n_output_dims); for (std::size_t i = 0; i < n_output_dims; i++) { const std::int64_t in_dim = input_shape[input_offset + i]; const std::int64_t out_dim = output_shape[i]; // This matches PyTorch's implementation as long as each input dim is // divisible by the corresponding output dim. If this is not the case, the // shape will be correct but the output will differ. if (in_dim % out_dim != 0) { const auto msg = fmt::format("Input dim {} ({}) is not divisible by the corresponding " "output dim ({}). 
The results will differ numerically " "from PyTorch's implementation.", i, in_dim, out_dim); ERROR(msg); } stride[i] = in_dim / out_dim; kernel_shape[i] = in_dim - (out_dim - 1) * stride[i]; } const std::vector padding(n_output_dims * 2, 0); return createAveragepool(graph, {x}, kernel_shape, 0, 0, padding, stride); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::max_pool1d, poolingHandler); registerHandler(c10::aten::avg_pool1d, poolingHandler); registerHandler(c10::aten::max_pool2d, poolingHandler); registerHandler(c10::aten::avg_pool2d, poolingHandler); registerHandler(c10::aten::max_pool3d, poolingHandler); registerHandler(c10::aten::avg_pool3d, poolingHandler); registerHandler(c10::aten::max_pool1d_with_indices, poolingHandler); registerHandler(c10::aten::max_pool2d_with_indices, poolingHandler); registerHandler(c10::aten::max_pool3d_with_indices, poolingHandler); registerHandler(c10::aten::adaptive_avg_pool1d, adaptivePoolingHandler); registerHandler(c10::aten::adaptive_avg_pool2d, adaptivePoolingHandler); registerHandler(c10::aten::adaptive_avg_pool3d, adaptivePoolingHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/PopartCanonicalizationUtils.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include
#include
#include

#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"

#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"

namespace poptorch {

namespace {

// Integer attribute set on a node to flag it for later deletion (see
// markNodeForDeletion / isMarkedForDeletion).
const c10::Symbol delete_node_attr = c10::Symbol::attr("delete_node");

// Returns the registry mapping a symbol to its handler.
//
// The map is a function-local static so it is constructed on first use: this
// avoids the static initialisation order fiasco when handlers are registered
// from static constructors in other translation units.
std::unordered_map &symbolHandlers() {
  static std::unordered_map symbol_handlers;
  return symbol_handlers;
}

} // namespace

// Registers `handler` as the handler for `symbol`.
//
// Errors if a handler is already registered for `symbol`; otherwise returns
// true.
bool registerHandler(c10::Symbol symbol, const SymbolHandler &handler) {
  logging::trace("Registering handler for symbol {}", symbol.toDisplayString());
  // emplace() does not overwrite: `.second` is false if the key already exists.
  const bool new_handler = symbolHandlers().emplace(symbol, handler).second;
  ERROR_ON_MSG(!new_handler, "Symbol " << symbol.toDisplayString()
                                       << " already has a handler registered");
  return new_handler;
}

// Return a copy of the handler registered for this kind of node, or an empty
// std::function if none is registered.
SymbolHandler getHandler(torch::jit::NodeKind kind) { const auto it = symbolHandlers().find(kind); if (it != symbolHandlers().cend()) { return it->second; } return {}; } bool allInputsOfType(torch::jit::Node *node, at::ScalarType type, int ignore_input) { int idx = 0; for (const auto &input : node->inputs()) { if (idx++ == ignore_input) { continue; } const auto tensor_type = input->type()->cast(); ERROR_ON(!tensor_type); ERROR_ON(!tensor_type->scalarType()); if ((*tensor_type->scalarType()) != type) { return false; } } return true; } bool allInputsBool(torch::jit::Node *node, int ignore_input) { return allInputsOfType(node, at::ScalarType::Bool, ignore_input); } bool allInputsInteger(torch::jit::Node *node, int ignore_input) { int idx = 0; for (const auto &input : node->inputs()) { if (idx++ == ignore_input) { continue; } const auto tensor = input->type()->cast(); ERROR_ON(!tensor); ERROR_ON(!tensor->scalarType()); if (!isIntegralType(*tensor->scalarType(), false)) { return false; } } return true; } std::vector handleTensorList(torch::jit::Node *node) { const auto inputs = node->inputs(); // // Just convert the node->inputs array ref to vector and return it. return std::vector(inputs.cbegin(), inputs.cend()); } // Add a vector of ints to the IR as a constant. torch::jit::Value * intVectorToIrConstant(torch::jit::Graph *graph, const std::vector &ints) { const std::vector dimensions = { static_cast(ints.size())}; return createConstantInt(graph, ints, dimensions)->output(); } // Get the shape of a tensor and add it to the graph as a constant value. torch::jit::Value *shapeFromTensorAsIR(torch::jit::Graph *graph, torch::jit::Value *value) { // Extract the type from the pytorch IR. const std::vector shape = shapeFromTensor(value); return intVectorToIrConstant(graph, shape); } // Get the scalar type of a given tensor. at::ScalarType getNodeScalarType(const torch::jit::Value *tensor) { // The returned value must be a tensor. 
c10::TensorTypePtr const return_tensor = tensor->type()->expect(); // Deduce the type from the scalar type on the return. return *return_tensor->scalarType(); } bool hasUnityValue(torch::jit::Value *value) { const auto tensor = getNodeTensorAttrValue(value->node()); if (tensor.numel() != 1) { return false; } return tensor.to(at::ScalarType::Float).item() == 1.0; } bool isNone(torch::jit::Node *node) { if (node->kind() != c10::prim::Constant) { return false; } const auto sym = c10::attr::value; return !node->hasAttribute(sym); } bool isNone(const torch::jit::Value *value) { return value->type()->cast(); } std::int64_t handleDimensionParam(torch::jit::Value *value, const c10::TensorTypePtr &as_tensor) { // Extract the dim. std::int64_t dim = constantToLong(value->node()); c10::VaryingShape const dims = as_tensor->sizes(); // If dim is less than zero subtract it to get the actual dimension. if (dim < 0) { dim = *dims.size() + dim; } // Return the dim. return dim; } bool isAnyConstant(torch::jit::Node *node) { return isTensorConstant(node) || node->kind() == c10::prim::Constant; } bool isFloatingPointConstant(torch::jit::Node *node) { const auto tensor_type = node->output()->type()->cast(); if (tensor_type) { const auto scalar_type = *tensor_type->scalarType(); return c10::isFloatingType(scalar_type); } ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get())); return torch::jit::constant_as(node->output()) .value() .isFloatingPoint(); } bool isTensorConstant(torch::jit::Node *node) { return (node->kind() == symbols::poptorch::tensor_constant || node->kind() == symbols::poptorch::host_side_tensor_constant); } bool isConstantScalar(torch::jit::Value *input) { if (!isTensorConstant(input->node())) { return false; } const std::vector shape = shapeFromTensor(input); const int64_t numel = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); return numel == 1; } float constantToFloat(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), 
"Cannot force a non-constant '" << node->kind().toQualString() << "' node to a float"); if (node->output()->type()->cast()) { return getNodeTensorAttrValue(node).to(at::ScalarType::Float).item(); } ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get())); const auto s = torch::jit::constant_as(node->output()); return s.value().toFloat(); } torch::jit::Node *constantToLongConstant(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '" << node->kind().toQualString() << "' node to a long constant"); ERROR_ON(!node->output()->type()->cast()); setNodeTensorAttrValue(node, getNodeTensorAttrValue(node).to(at::ScalarType::Long)); node->output()->inferTypeFrom(getNodeTensorAttrValue(node)); return node; } std::int32_t constantToInt(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '" << node->kind().toQualString() << "' node to an int"); if (node->output()->type()->cast()) { return getNodeTensorAttrValue(node) .to(at::ScalarType::Int) .item(); } ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get())); const auto s = torch::jit::constant_as(node->output()); return s.value().toInt(); } std::int64_t constantToLong(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant '" << node->kind().toQualString() << "' node to a long"); if (node->output()->type()->cast()) { return getNodeTensorAttrValue(node) .to(at::ScalarType::Long) .item(); } ERROR_ON(!node->output()->type()->isSubtypeOf(c10::NumberType::get())); const auto s = torch::jit::constant_as(node->output()); const std::int64_t val = s.value().toLong(); return val == INT_MAX ? 
LONG_MAX : val; } std::vector constantToLongVec(torch::jit::Node *node) { return constantListToVec(node, constantToLong); } std::vector constantToFloatVec(torch::jit::Node *node) { return constantListToVec(node, constantToFloat); } bool constantToBool(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant node to a bool"); return constantToInt(node) != 0; } std::string constantToString(torch::jit::Node *node) { ERROR_ON_MSG(!isTensorConstant(node), "Cannot force a non-constant node to a string"); auto &&t = getNodeTensorAttrValue(node); ERROR_ON(!t.is_contiguous()); const auto length = t.sizes().at(0); std::string s(reinterpret_cast(t.data_ptr()), length); return s; } at::ScalarType constantToScalarType(torch::jit::Node *node) { const auto as_num = constantToInt(node); ERROR_ON_MSG(as_num < 0 || as_num > at::NumScalarTypes, "Node has a value (" << as_num << ") which is not " "representable as a torch dtype"); return static_cast(as_num); } std::int32_t convertReduceToPopart(std::int32_t pytorchReduce) { // Popart: // Sum = 0, Mean =1, NoReduction = 2 // Pytorch // Sum = 2, Mean =1, NoReduction = 0 if (pytorchReduce == 0) { return 2; } if (pytorchReduce == 1) { return 1; } if (pytorchReduce == 2) { return 0; } ERROR("Unsupported pytorch reduce"); } void markNodeForDeletion(torch::jit::Node *node) { node->i_(delete_node_attr, 1); } bool isMarkedForDeletion(torch::jit::Node *node) { return node->hasAttribute(delete_node_attr) && node->i(delete_node_attr) > 0; } void replaceOutputUse(torch::jit::Value *old_val, torch::jit::Value *new_val) { // Make sure the new value matches the type of the original value. new_val->setType(old_val->type()); // Replace the old value with the new one. 
old_val->replaceAllUsesWith(new_val); } void replaceOutputUse(torch::jit::Node *oldNode, torch::jit::Node *new_node, std::uint64_t outputIdx) { logging::trace("Replacing node output %{} with that of {}", oldNode->output(outputIdx)->debugName(), *new_node); torch::jit::Value *new_val = new_node->output(outputIdx); torch::jit::Value *old_val = oldNode->output(outputIdx); replaceOutputUse(old_val, new_val); } // An odd function which returns each tensor dimension as an array, a helper for // torch.max(tensor) and torch.min(tensor). I.E a 4D tensor will return (0, 1, // 2, 3). std::vector reduceHelperDimensionCreator(torch::jit::Value *value) { // Extract the type from the pytorch IR. c10::TensorTypePtr const as_tensor = value->type()->expect(); c10::VaryingShape const dims = as_tensor->sizes(); // Convert that IR type into a C++ vector of ints. std::vector shape(dims.sizes()->size()); // Fill the vector with sequentially incrementing values. std::iota(shape.begin(), shape.end(), 0); return shape; } bool attributeEqual(torch::jit::Node *a, torch::jit::Node *b, c10::Symbol attr) { if (!a->hasAttribute(attr) || !b->hasAttribute(attr)) { return false; } const auto attr_kind = a->kindOf(attr); if (b->kindOf(attr) != attr_kind) { return false; } switch (attr_kind) { case torch::jit::AttributeKind::f: return a->f(attr) == b->f(attr); case torch::jit::AttributeKind::fs: return a->fs(attr) == b->fs(attr); case torch::jit::AttributeKind::s: return a->s(attr) == b->s(attr); case torch::jit::AttributeKind::ss: return a->ss(attr) == b->ss(attr); case torch::jit::AttributeKind::i: return a->i(attr) == b->i(attr); case torch::jit::AttributeKind::is: return a->is(attr) == b->is(attr); case torch::jit::AttributeKind::t: return a->t(attr).equal(b->t(attr)); case torch::jit::AttributeKind::ts: { if (a->ts(attr).size() != b->ts(attr).size()) { return false; } auto a_it = a->ts(attr).cbegin(); auto b_it = b->ts(attr).cbegin(); for (; a_it != a->ts(attr).cend(); a_it++, b_it++) { if 
(!a_it->equal(*b_it)) { return false; } } return true; } case torch::jit::AttributeKind::g: return a->g(attr) == b->g(attr); case torch::jit::AttributeKind::gs: return a->gs(attr) == b->gs(attr); case torch::jit::AttributeKind::c: return a->c(attr) == b->c(attr); case torch::jit::AttributeKind::cs: return a->cs(attr) == b->cs(attr); case torch::jit::AttributeKind::ty: return a->ty(attr) == b->ty(attr); case torch::jit::AttributeKind::tys: return a->tys(attr) == b->tys(attr); case torch::jit::AttributeKind::ival: return a->ival(attr) == b->ival(attr); } ERROR("Invalid type in attributeSame."); } torch::jit::Value *castToPromoteType(torch::jit::Graph *graph, torch::jit::Value *tensor, c10::ScalarType promoteType) { if (getNodeScalarType(tensor) != promoteType) { return createCast(graph, tensor, promoteType)->output(); } return tensor; } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/PopartCanonicalizationUtils.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef SOURCE_POPART_CANONICALIZATION_UTILS_H #define SOURCE_POPART_CANONICALIZATION_UTILS_H #include #include #include #include #include "poptorch_logging/Error.hpp" namespace poptorch { using SymbolHandler = std::function; bool registerHandler(c10::Symbol symbol, const SymbolHandler &handler); std::vector reduceHelperDimensionCreator(torch::jit::Value *value); inline std::vector reduceHelperDimensionCreator(torch::jit::Value *value, const std::vector &axes) { if (!axes.empty()) { return axes; } return reduceHelperDimensionCreator(value); } // Return a pointer to a handler if one is registered for this kind of node or // an empty std::function otherwise. SymbolHandler getHandler(torch::jit::NodeKind kind); // Returns true if all inputs to `node` are of the given `type`. // // \param ignore_input Index of an input to not check. 
// // \note Errors if a not-ignored input is not a tensor, or doesn't have a scalar // type. bool allInputsOfType(torch::jit::Node *node, at::ScalarType type, int ignore_input = -1); // Returns true if all inputs are Bools bool allInputsBool(torch::jit::Node *node, int ignore_input = -1); // Returns true if all inputs are of integral type, compatible with // c10::isIntegralType. bool allInputsInteger(torch::jit::Node *node, int ignore_input = -1); // Get the tensor shape and add it to the IR as a constant primitive. torch::jit::Value *shapeFromTensorAsIR(torch::jit::Graph *graph, torch::jit::Value *value); // Get the scalar type of this tensor. at::ScalarType getNodeScalarType(const torch::jit::Value *tensor); torch::jit::Value *intVectorToIrConstant(torch::jit::Graph *graph, const std::vector &ints); std::vector handleTensorList(torch::jit::Node *node); // Returns true if the value is a constant of exactly unity (1) bool hasUnityValue(torch::jit::Value *value); // Some operations take in an optional tensor. A "none" constant is passed in to // mark a tensor which is not there. bool isNone(torch::jit::Node *node); bool isNone(const torch::jit::Value *value); std::int64_t handleDimensionParam(torch::jit::Value *value, const c10::TensorTypePtr &as_tensor); bool isAnyConstant(torch::jit::Node *node); bool isFloatingPointConstant(torch::jit::Node *node); bool isTensorConstant(torch::jit::Node *node); // Does the given value (tensor) represent a single, scalar value? // // Useful in cases of broadcasting. bool isConstantScalar(torch::jit::Value *input); // Force a constant to be a float: this is appropriate if required for popart // (onnx); e.g. Gemm alpha and beta are always floats float constantToFloat(torch::jit::Node *node); // Force a constant to be a long constant by casting. // This is appropriate if required for popart (onnx) // e.g. TopK takes int64 indices as a tensor. 
torch::jit::Node *constantToLongConstant(torch::jit::Node *node);

// Force a constant to be an int: this is appropriate if required for popart
// (onnx)
std::int32_t constantToInt(torch::jit::Node *node);

// Force a constant to be a long: this is appropriate if required for popart
// (onnx) e.g. Slice takes int64 indices
std::int64_t constantToLong(torch::jit::Node *node);

// Forces a ListConstruct to be a vector of int64_ts
std::vector constantToLongVec(torch::jit::Node *node);

// Forces a ListConstruct to be a vector of floats
std::vector constantToFloatVec(torch::jit::Node *node);

// Extract a boolean from a constant containing one (encoded as an int32_t)
bool constantToBool(torch::jit::Node *node);

// Extracts a string from a constant containing a string
std::string constantToString(torch::jit::Node *node);

// Extract a `at::ScalarType` from a constant containing a number that
// represents one.
at::ScalarType constantToScalarType(torch::jit::Node *node);

// Forces a ListConstruct into a vector of the given type.
//
// `constantExtractFunc` is called on the producer node of each list element,
// in order, and its results are collected into the returned vector.
// Errors if `node` is not a prim::ListConstruct.
template
std::vector constantListToVec(torch::jit::Node *node,
                              ExtractFunc &&constantExtractFunc) {
  ERROR_ON(node->kind() != c10::prim::ListConstruct);

  auto node_inputs = node->inputs();

  std::vector result;
  // One result per list element.
  result.reserve(node_inputs.size());

  for (torch::jit::Value *value : node_inputs) {
    result.push_back(constantExtractFunc(value->node()));
  }

  return result;
}

// Both pytorch and popart represent reduce as an enum but with different
// values.
std::int32_t convertReduceToPopart(std::int32_t pytorchReduce);

// Flag a node so that isMarkedForDeletion() returns true for it.
void markNodeForDeletion(torch::jit::Node *node);
bool isMarkedForDeletion(torch::jit::Node *node);

// Redirect all the uses of `old_val` to `new_val` (which takes the type of
// `old_val`).
void replaceOutputUse(torch::jit::Value *old_val, torch::jit::Value *new_val);

// Same as above, for the output at index `outputIdx` of both nodes.
void replaceOutputUse(torch::jit::Node *oldNode, torch::jit::Node *new_node,
                      std::uint64_t outputIdx);

// Returns true if both nodes have the given attribute with the same kind and
// an equal value.
bool attributeEqual(torch::jit::Node *a, torch::jit::Node *b,
                    c10::Symbol attrb);

// Returns the promoted type of two or more scalar types.
//
// With exactly two types this is c10::promoteTypes(a, b); with more, the
// first two are promoted and the result is folded with the remaining ones.
template
c10::ScalarType promoteTypes(const c10::ScalarType &a, const c10::ScalarType &b,
                             Tail &&...tail) {
  if constexpr (sizeof...(tail) == 0) {
    return c10::promoteTypes(a, b);
  } else {
    return promoteTypes(promoteTypes(a, b), std::forward(tail)...);
  }
}

// Returns the promoted type of the scalar types of two or more tensor values.
template
c10::ScalarType promoteTypes(const torch::jit::Value *a,
                             const torch::jit::Value *b, Tail &&...tail) {
  return promoteTypes(getNodeScalarType(a), getNodeScalarType(b),
                      getNodeScalarType(std::forward(tail))...);
}

// Returns `tensor` if it already has type `promoteType`, otherwise the output
// of a cast of `tensor` to that type.
torch::jit::Value *castToPromoteType(torch::jit::Graph *graph,
                                     torch::jit::Value *tensor,
                                     c10::ScalarType promoteType);

// Casts every given tensor to their common promoted type (only the tensors
// which do not already have that type get a cast). The results are returned
// in the same order as the arguments.
template
std::vector promoteTensors(torch::jit::Graph *graph,
                           torch::jit::Value *tensor_a,
                           torch::jit::Value *tensor_b, Tail &&...tail) {
  const c10::ScalarType promote_type =
      promoteTypes(tensor_a, tensor_b, std::forward(tail)...);
  return {castToPromoteType(graph, tensor_a, promote_type),
          castToPromoteType(graph, tensor_b, promote_type),
          castToPromoteType(graph, std::forward(tail), promote_type)...};
}

} // namespace poptorch

#endif // SOURCE_POPART_CANONICALIZATION_UTILS_H

================================================
FILE: poptorch/source/popart_canonicalization/PoptorchHandlers.gen.cpp
================================================
// DO NOT EDIT! Generated by PopTorchHandlers.py
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {

namespace {

torch::jit::Node *beginIpuBlockHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = constantToLong(x->node());
  auto *y = node->input(1);
  auto t1 = constantToLong(y->node());
  auto *z = node->input(2);
  auto t2 = constantToLong(z->node());
  // beginIpuBlock(clong(x), clong(y), clong(z))
  return createBeginIpuBlock(graph, t0, t1, t2);
}

torch::jit::Node *beginMultiConvHandler(torch::jit::Graph * /*graph*/,
                                        torch::jit::Node * /*node*/) {
  //
  return nullptr;
}

torch::jit::Node *callCpuOpHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = handleTensorList(x->node());
  auto *s = node->input(1);
  auto t1 = constantToString(s->node());
  auto *original_node = node;
  // callCpuOp(TensorList(x), cstr(s), original_node)
  return createCallCpuOp(graph, t0, t1, original_node);
}

torch::jit::Node *endCpuOpHandler(torch::jit::Graph * /*graph*/,
                                  torch::jit::Node * /*node*/) {
  //
  return nullptr;
}

torch::jit::Node *endForLoopHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *output = node->input(0);
  auto *inputs = node->input(1);
  auto *trip_count = node->input(2);
  auto t0 = constantToLong(trip_count->node());
  // endForLoop(output, inputs, clong(trip_count))
  return createEndForLoop(graph, output, inputs, t0);
}

torch::jit::Node *endIpuBlockHandler(torch::jit::Graph * /*graph*/,
                                     torch::jit::Node * /*node*/) {
  //
  return nullptr;
}

torch::jit::Node *identityLossHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *r = node->input(1);
  auto t0 = constantToInt(r->node());
  // identityloss(x, cint(r))
  return createIdentityloss(graph, {x}, t0);
}

torch::jit::Node *internalCastHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *tensor = node->input(0);
  auto *dtype = node->input(1);
  auto t0 = constantToString(dtype->node());
  // internalCast(tensor, cstr(dtype))
  return createInternalCast(graph, tensor, t0);
}

torch::jit::Node *nopHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *args = node->input(0);
  // nop(args)
  return createNop(graph, {args});
}

torch::jit::Node *optimizerGroupHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = constantToLong(x->node());
  auto *l = node->input(1);
  auto t1 = handleTensorList(l->node());
  // optimizerGroup(clong(x), TensorList(l))
  return createOptimizerGroup(graph, t0, t1);
}

torch::jit::Node *popNameScopeHandler(torch::jit::Graph * /*graph*/,
                                      torch::jit::Node * /*node*/) {
  //
  return nullptr;
}

torch::jit::Node *recomputationCheckpointHandler(torch::jit::Graph *graph,
                                                 torch::jit::Node *node) {
  auto *i0 = node->input(0);
  // recomputationCheckpoint(i0)
  return createRecomputationCheckpoint(graph, i0);
}

torch::jit::Node *setMatmulSerializationHandler(torch::jit::Graph *graph,
                                                torch::jit::Node *node) {
  auto *x = node->input(0);
  auto *s = node->input(1);
  auto t0 = constantToString(s->node());
  auto *a = node->input(2);
  auto t1 = constantToLong(a->node());
  auto *b = node->input(3);
  auto t2 = constantToInt(b->node());
  // setMatMulSerialization(x, cstr(s), clong(a), cint(b))
  return createSetMatMulSerialization(graph, x, t0, t1, t2 != 0);
}

torch::jit::Node *startForLoopHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *inputs = node->input(0);
  // startForLoop(inputs)
  return createStartForLoop(graph, inputs);
}

torch::jit::Node *startIfBlockHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *condition = node->input(0);
  // startIfBlockHandler(condition)
  return createStartIfBlock(graph, condition);
}

torch::jit::Node *startElseBlockHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  auto *outputs_then = node->input(0);
  // startElseBlockHandler(outputs_then)
  return createStartElseBlock(graph, outputs_then);
}

torch::jit::Node *endIfBlockHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  auto *outputs_else = node->input(0);
  auto *condition = node->input(1);
  // startElseBlockHandler(outputs_else, condition)
  return createEndIfBlock(graph, outputs_else, condition);
}

torch::jit::Node *updateParamInplaceHandler(torch::jit::Graph *graph,
                                            torch::jit::Node *node) {
  auto *i0 = node->input(0);
  auto *i1 = node->input(1);
  // copyvarupdate(i0, i1)
  return createCopyvarupdate(graph, {i0, i1});
}

} // namespace

__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(symbols::poptorch::begin_ipu_block, beginIpuBlockHandler);
  registerHandler(symbols::poptorch::begin_multi_conv, beginMultiConvHandler);
  registerHandler(symbols::poptorch::call_cpu_op, callCpuOpHandler);
  registerHandler(symbols::poptorch::end_cpu_op, endCpuOpHandler);
  registerHandler(symbols::poptorch::end_for_loop, endForLoopHandler);
  registerHandler(symbols::poptorch::end_ipu_block, endIpuBlockHandler);
  registerHandler(symbols::poptorch::identity_loss, identityLossHandler);
  registerHandler(symbols::poptorch::internal_cast, internalCastHandler);
  registerHandler(symbols::poptorch::nop, nopHandler);
  registerHandler(symbols::poptorch::optimizer_group, optimizerGroupHandler);
  registerHandler(symbols::poptorch::pop_name_scope, popNameScopeHandler);
  registerHandler(symbols::poptorch::recomputation_checkpoint,
                  recomputationCheckpointHandler);
  registerHandler(symbols::poptorch::set_matmul_serialization,
                  setMatmulSerializationHandler);
  registerHandler(symbols::poptorch::start_for_loop, startForLoopHandler);
  registerHandler(symbols::poptorch::start_if_block, startIfBlockHandler);
  registerHandler(symbols::poptorch::start_else_block, startElseBlockHandler);
  registerHandler(symbols::poptorch::end_if_block, endIfBlockHandler);
  registerHandler(symbols::poptorch::update_param_inplace,
                  updateParamInplaceHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/PyGTorchScatterOps.cpp
================================================
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "ScatterReduction.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Maps the kind of a torch_scatter node (scatter_max, scatter_min or
// scatter_mul) to the matching ScatterReduction value, as an integer.
//
// Errors for any other node kind.
std::int32_t getReductionMethod(const torch::jit::Node *node) {
  const auto kind = node->kind();

  if (kind == torch_scatter::scatter_max) {
    return static_cast(ScatterReduction::Max);
  }
  if (kind == torch_scatter::scatter_min) {
    return static_cast(ScatterReduction::Min);
  }
  if (kind == torch_scatter::scatter_mul) {
    return static_cast(ScatterReduction::Mul);
  }

  ERROR("Unsupported reduction for node: " << nodeToString(node));
}

// Handler for torch_scatter::scatter_max / scatter_min / scatter_mul.
//
// The reduction itself maps to a single scatterreduce op. If the node has a
// second output (the indices of the selected elements) and that output is
// used, extra ops are added to compute it; in that case the outputs of `node`
// are rewired here and `node` is marked for deletion.
//
// \returns The scatterreduce node computing the reduced values.
torch::jit::Node *torchScatterHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  static constexpr bool enable_index_broadcast = true;

  // Signatures for scatter_max, scatter_min, scatter_mul:
  // (Tensor src, Tensor index, int dim, Tensor? out, int? dim_size)
  auto *src = node->input(0);
  auto *index = node->input(1);
  const auto src_type = src->type()->expect();
  const auto axis = handleDimensionParam(node->input(2), src_type);
  auto *opt_out = node->input(3);

  // The optional `out` tensor is only forwarded when it was provided.
  std::vector args{src, index};
  if (!isNone(opt_out)) {
    args.push_back(opt_out);
  }

  // The size of the scattered axis defaults to the one of the node's output
  // shape, unless `dim_size` was provided.
  auto shape = shapeFromTensor(node->output(0));
  auto axis_size = shape.at(axis);
  auto *opt_axis_size = node->input(4);
  if (!isNone(opt_axis_size)) {
    axis_size = constantToInt(opt_axis_size->node());
  }

  auto *result =
      createScatterreduce(graph, args, axis_size, axis, enable_index_broadcast,
                          getReductionMethod(node));

  if (node->outputs().size() == 1) {
    return result;
  }

  // Both scatter_max and scatter_min return two outputs where the second one
  // is the index but most often this second output is simply ignored.
  if (!node->output(1)->hasUses()) {
    // the indices output is unused so is safe to delete
    node->eraseOutput(1);
    return result;
  }

  // Calculate the indices of the max/min
  const auto ishape = shapeFromTensor(src);
  // Shape that is 1 everywhere except along `axis`.
  std::vector index_range_shape(ishape.size(), 1);
  index_range_shape[axis] = ishape[axis];
  const auto gather_handler = getHandler(c10::aten::gather);
  // The result's type is set before its output is passed to the gather
  // handler.
  result->output()->setType(src_type->withSizes(shape));
  // Gather the reduced values back to the positions of `src`.
  auto *gather =
      createHandlerOperation(graph, gather_handler,
                             {result->output(), node->input(2), index})
          ->output();

  // true if the scatter chose this location in src, false if we didn't
  auto *mask = createEqual(graph, {gather, src})->output();

  // 1, 2, ..., ishape[axis] laid out along `axis`.
  std::vector vals(ishape[axis]);
  std::iota(std::begin(vals), std::end(vals), 1);
  auto *index_range =
      createConstantInt(graph, vals, index_range_shape)->output();
  auto *not_chosen =
      createConstantInt(graph, {ishape[axis] + 1}, {1})->output();

  // The 1-based index in src if this location was chosen, ishape[axis] + 1 if
  // it wasn't
  auto *index_of_result =
      createWhere(graph, {mask, index_range, not_chosen})->output();

  // Apply the same scattering to our index tensor as we did to the input tensor
  // (a Min reduction is used for the indices).
  static constexpr std::int32_t min_reduce =
      static_cast(ScatterReduction::Min);
  auto *arg_scatter =
      createScatterreduce(graph, {index_of_result, index}, axis_size, axis,
                          enable_index_broadcast, min_reduce)
          ->output();

  // Now we've got a tensor of 1-based indices, with zeroes where no index
  // was scattered. We need to transform this to zero-based indices, with
  // ishape[axis] where no index was scattered.
  // NOTE(review): this relies on the remainder of -1 by (ishape[axis] + 1)
  // being ishape[axis], i.e. a remainder taking the sign of the divisor -
  // confirm against createRemainder.
  auto *one = createConstantInt(graph, {1}, {1})->output();
  arg_scatter = createSub(graph, {arg_scatter, one})->output();
  arg_scatter = createRemainder(graph, {arg_scatter, not_chosen})->output();

  // Two outputs come from two different nodes, so the uses are rewired by hand
  // and the original node is flagged for deletion.
  replaceOutputUse(node->output(0), result->output());
  replaceOutputUse(node->output(1), arg_scatter);
  markNodeForDeletion(node);
  return result;
}

} // namespace

// Runs at library load time (constructor attribute) and registers the
// torch_scatter handlers.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(torch_scatter::scatter_max, torchScatterHandler);
  registerHandler(torch_scatter::scatter_min, torchScatterHandler);
  registerHandler(torch_scatter::scatter_mul, torchScatterHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/PyGTorchSplineConvOps.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Handler for torch_spline_conv::spline_basis.
//
// Node inputs: (Tensor pseudo, Tensor kernelSize, Tensor isOpenSpline,
//               int degree)
// The three tensors are forwarded as op inputs; `degree` must be a constant
// and is passed as an integer attribute.
torch::jit::Node *torchSplineBasisHandler(torch::jit::Graph *graph,
                                          torch::jit::Node *node) {
  const std::vector tensor_inputs{node->input(0), node->input(1),
                                  node->input(2)};
  auto *degree_node = node->input(3)->node();
  return createSplinebasis(graph, tensor_inputs, constantToInt(degree_node));
}

// Handler for torch_spline_conv::spline_weighting.
//
// Node inputs: (Tensor input, Tensor weight, Tensor basis,
//               Tensor weightIndex)
// All four tensors are forwarded unchanged as op inputs.
torch::jit::Node *torchSplineWeightingHandler(torch::jit::Graph *graph,
                                              torch::jit::Node *node) {
  const std::vector tensor_inputs{node->input(0), node->input(1),
                                  node->input(2), node->input(3)};
  return createSplineweighting(graph, tensor_inputs);
}

} // namespace

// Registers the torch_spline_conv handlers when the library is loaded.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(torch_spline_conv::spline_basis, torchSplineBasisHandler);
  registerHandler(torch_spline_conv::spline_weighting,
                  torchSplineWeightingHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/RNNOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Reshapes `tensor` so that it gains a leading dimension of size 1.
torch::jit::Value *prependDimension(torch::jit::Graph *graph,
                                    torch::jit::Value *tensor) {
  auto shape = shapeFromTensor(tensor);
  shape.insert(shape.begin(), 1);
  return createReshape(graph, tensor, shape)->output();
}

// Splits `tensor` along axis 1 into three slices of `slice_size`, optionally
// transposes the last two dims of each slice, optionally swaps the first two
// slices, and concatenates the result back along axis 1.
torch::jit::Value *reshapeWeights(torch::jit::Graph *graph,
                                  torch::jit::Value *tensor,
                                  int64_t slice_size, bool transpose = false,
                                  bool swap = true) {
  std::vector<torch::jit::Value *> slices;
  unsigned num_slices = 3;
  torch::jit::Node *split = createSplit(graph, {tensor}, num_slices, 1,
                                        {slice_size, slice_size, slice_size});
  for (unsigned i = 0; i < num_slices; ++i) {
    torch::jit::Value *transposed = split->output(i);
    if (transpose) {
      transposed = createTranspose(graph, {transposed}, {0, 2, 1})->output();
    }
    slices.push_back(transposed);
  }
  if (swap) {
    std::swap(slices[0], slices[1]);
  }
  torch::jit::Node *concat = createConcat(graph, slices, 1);
  return concat->output();
}

// Canonicalises aten::gru. Only a single-layer, unidirectional GRU with
// dropout == 0 is accepted. The original node's two outputs are rewired and
// the node is marked for deletion, hence the nullptr return.
torch::jit::Node *gruHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  auto *input = node->input(0);
  auto *hx = node->input(1);
  auto params = node->input(2)->node()->inputs();
  bool bias = constantToBool(node->input(3)->node());
  int num_layers = constantToLong(node->input(4)->node());
  float dropout = constantToFloat(node->input(5)->node());
  bool bidirectional = constantToBool(node->input(7)->node());
  bool batch_first = constantToBool(node->input(8)->node());

  ERROR_ON_MSG(num_layers != 1, "Only GRU with 1 layer supported");
  ERROR_ON_MSG(dropout != 0.0f, "GRU only supports dropout = 0.0");
  ERROR_ON_MSG(bidirectional, "bidirectional GRU not supported");

  auto *gate_weights = prependDimension(graph, params[0]);
  auto *recur_weights = prependDimension(graph, params[1]);

  // The shape is read before the optional transpose below, so with
  // batch_first the input is still (batch, seq, feature) and the two leading
  // dims must be read in swapped order (same convention as rnnHandler).
  auto input_shape = shapeFromTensor(input);
  unsigned seq_length = input_shape[batch_first ? 1 : 0];
  unsigned batch_size = input_shape[batch_first ? 0 : 1];

  auto recur_shape = shapeFromTensor(recur_weights);
  int64_t hidden_size = recur_shape[2];

  gate_weights = reshapeWeights(graph, gate_weights, hidden_size);
  recur_weights = reshapeWeights(graph, recur_weights, hidden_size);

  torch::jit::Value *biases;
  if (bias) {
    auto *gate_biases = prependDimension(graph, params[2]);
    auto *recur_biases = prependDimension(graph, params[3]);
    gate_biases = reshapeWeights(graph, gate_biases, hidden_size);
    recur_biases = reshapeWeights(graph, recur_biases, hidden_size);
    biases = createConcat(graph, {gate_biases, recur_biases}, 1)->output();
  } else {
    biases = createConstantFloatLike(graph, input, {0.},
                                     {1l, 6l * hidden_size})
                 ->output();
  }

  // TODO(T54563)
  auto *seq_lens =
      createConstantInt(graph, {seq_length}, {batch_size})->output();

  if (batch_first) {
    input = createTranspose(graph, {input}, {1, 0, 2})->output();
  }

  auto *gru = createGru(
      graph, {input, gate_weights, recur_weights, biases, seq_lens, hx},
      hidden_size);
  auto *output = createSqueeze(graph, {gru->output(0)}, {1})->output();

  if (batch_first) {
    output = createTranspose(graph, {output}, {1, 0, 2})->output();
  }

  replaceOutputUse(node->output(0), output);
  replaceOutputUse(node->output(1), gru->output(1));
  markNodeForDeletion(node);
  return nullptr;
}

// Canonicalises aten::lstm. Only a single-layer, unidirectional LSTM with
// biases and dropout == 0 is accepted. The original node's three outputs are
// rewired and the node is marked for deletion, hence the nullptr return.
torch::jit::Node *lstmHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::lstm(Tensor self, Tensor[] hx, Tensor[] weights, bool bias,
  // int num_layers, float dropout, bool training, bool bidirectional,
  // bool batch_first) -> Tensor, (Tensor, Tensor)
  torch::jit::Value *input = node->input(0);

  torch::jit::ArrayRef<torch::jit::Value *> hidden_layers =
      node->input(1)->node()->inputs();
  torch::jit::ArrayRef<torch::jit::Value *> weights_list =
      node->input(2)->node()->inputs();

  bool use_bias = constantToBool(node->input(3)->node());
  ERROR_ON_MSG(!use_bias, "LSTM without biases not supported");
  std::int64_t num_layers = constantToLong(node->input(4)->node());
  ERROR_ON_MSG(num_layers != 1, "Only LSTM with 1 layer supported");

  float dropout = constantToFloat(node->input(5)->node());
  ERROR_ON_MSG(dropout != 0.0f, "LSTM only supports dropout = 0.0");

  bool bidirectional = constantToBool(node->input(7)->node());
  ERROR_ON_MSG(bidirectional, "bidirectional LSTM not supported");

  bool batch_first = constantToBool(node->input(8)->node());

  // An LSTM state is made of 4 values
  constexpr std::uint64_t state_size = 4;
  const std::int64_t num_weights =
      *weights_list[0]->type()->expect<c10::TensorType>()->sizes()[0];
  ERROR_ON(num_weights % state_size != 0);
  // Leading dim of the first weight tensor divided by the four gates; used
  // as the per-gate split size below.
  const std::int64_t hidden_size = num_weights / state_size;

  // def reshape_weights(onnx_weights):
  //    ws = builder.aiOnnx.split([w], 4, 1, [hidden_size] * 4)
  //    ws = [builder.aiOnnx.transpose([i], [0, 2, 1]) for i in ws]
  //    ws = builder.aiOnnx.concat([ws[i] for i in (2, 0, 3, 1)], 0)
  //    return ws
  //
  // Note: onnx weights are in IOFC order while Torch uses IFCO
  //
  // Biases don't need to be transposed
  auto reshape_tensor = [&](torch::jit::Value *values, bool areWeights) {
    const std::uint64_t num_dims_without_batch = areWeights ? 2 : 1;
    std::vector<std::int64_t> shape = shapeFromTensor(values);
    if (shape.size() == num_dims_without_batch) {
      // Add a batch dimension
      shape.insert(shape.begin(), 1);
      torch::jit::Node *reshape = createReshape(graph, values, shape);
      values = reshape->output();
    }
    torch::jit::Node *states =
        createSplit(graph, {values}, state_size, 1,
                    {hidden_size, hidden_size, hidden_size, hidden_size});
    std::vector<torch::jit::Value *> slices;
    for (std::uint64_t i = 0; i < state_size; ++i) {
      if (areWeights) {
        // Weights also need to be transposed
        torch::jit::Node *transposed =
            createTranspose(graph, {states->output(i)}, {0, 2, 1});
        slices.push_back(transposed->output());
      } else {
        slices.push_back(states->output(i));
      }
    }
    torch::jit::Node *concat = createConcat(
        graph, {slices[1], slices[0], slices[2], slices[3]}, 0);
    return concat->output();
  };

  torch::jit::Node *concat_weights =
      createConcat(graph,
                   {reshape_tensor(weights_list[0], true),
                    reshape_tensor(weights_list[1], true)},
                   1);
  torch::jit::Node *combine_biases =
      createAddNotInPlace(graph, reshape_tensor(weights_list[2], false),
                          reshape_tensor(weights_list[3], false));

  torch::jit::Node *concat_states =
      createConcat(graph, {hidden_layers[0], hidden_layers[1]}, 0);

  // Shape of the input *before* any transpose; seq_dim selects which of its
  // dims holds the sequence length.
  std::vector<std::int64_t> input_shape = shapeFromTensor(input);
  std::int64_t seq_dim = 0;
  // Transpose input BSF -> SBF
  if (batch_first) {
    torch::jit::Node *transpose = createTranspose(graph, {input}, {1, 0, 2});
    input = transpose->output();
    seq_dim = 1;
  }
  std::vector<torch::jit::Value *> args;
  args.push_back(input);
  args.push_back(concat_weights->output()); // input weights + output_weights
  args.push_back(combine_biases->output()); // biases
  args.push_back(concat_states->output());  // init_states

  torch::jit::Node *lstm = createLstm(graph, args, 1);

  // Keep the last slice from Y `[seq_length, num_directions, batch_size,
  // hidden_size]
  torch::jit::Node *y_h = createSlice(graph, {lstm->output(0)}, {INT_MAX},
                                      {input_shape[seq_dim] - 1}, {0});

  torch::jit::Value *output = lstm->output(0);
  // Transpose output SBF -> BSF
  if (batch_first) {
    torch::jit::Node *transpose = createTranspose(graph, {output}, {1, 0, 2});
    output = transpose->output();
  }

  // The shape of y_c returned by PopART has shape (batch_size, hidden_size).
  // Torch's c_n output has shape
  // (num_directions * num_layers, batch_size, hidden_size), but since we don't
  // support bidirectional or > 1 layers, this dimension is always 1 so we just
  // need to prepend a single dim
  auto *y_c = createUnsqueeze(graph, {lstm->output(1)}, {0});

  ERROR_ON(node->outputs().size() != 3);
  if (node->hasUses()) {
    replaceOutputUse(node->output(0), output);
    replaceOutputUse(node->output(1), y_h->output());
    replaceOutputUse(node->output(2), y_c->output());
  }

  markNodeForDeletion(node);
  return nullptr;
}

// Shared implementation for aten::rnn_tanh / aten::rnn_relu. `nonlinearity`
// is the activation name forwarded to createRnn. Only a single-layer,
// unidirectional RNN with biases and dropout == 0 is accepted. The original
// node's two outputs are rewired and the node is marked for deletion, hence
// the nullptr return.
torch::jit::Node *rnnHandler(torch::jit::Graph *graph, torch::jit::Node *node,
                             const std::string &nonlinearity) {
  // rnn_{tanh/relu}.input(Tensor input, Tensor hx, Tensor[] params,
  //     bool has_biases, int num_layers, float dropout, bool train,
  //     bool bidirectional, bool batch_first) -> (Tensor, Tensor)
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *hx = node->input(1);
  torch::jit::ArrayRef<torch::jit::Value *> params =
      node->input(2)->node()->inputs();

  // Validate has_biases *before* touching params[2] / params[3]: without
  // biases the parameter list only holds the two weight tensors, and indexing
  // past the end would be undefined behaviour instead of a clean error.
  bool has_biases = constantToBool(node->input(3)->node());
  ERROR_ON_MSG(!has_biases, "RNN without biases is not supported");

  torch::jit::Value *w_ih = params[0]; // input-hidden weights
  torch::jit::Value *w_hh = params[1]; // hidden-hidden weights
  torch::jit::Value *b_ih = params[2]; // input-hidden bias
  torch::jit::Value *b_hh = params[3]; // hidden-hidden bias

  int num_layers = constantToInt(node->input(4)->node());
  ERROR_ON_MSG(num_layers != 1, "Only RNN with 1 layer is supported");
  float dropout = constantToFloat(node->input(5)->node());
  ERROR_ON_MSG(dropout != 0.0f, "RNN only supports dropout = 0.0");
  bool bidirectional = constantToBool(node->input(7)->node());
  ERROR_ON_MSG(bidirectional, "Bidirectional RNN is not supported");
  bool batch_first = constantToBool(node->input(8)->node());

  auto input_shape = shapeFromTensor(input);
  int64_t sequence_length;
  int64_t batch_size;
  if (batch_first) {
    // N, L, H_in -> L, N, H_in
    input = createTranspose(graph, {input}, {1, 0, 2})->output();
    sequence_length = input_shape.at(1);
    batch_size = input_shape.at(0);
  } else {
    sequence_length = input_shape.at(0);
    batch_size = input_shape.at(1);
  }

  auto *b = createConcat(graph, {b_ih, b_hh}, 0)->output();
  // Fix concat result shape so that we can use prependDimension().
  auto hidden_size = shapeFromTensor(b_ih).front();
  b->setType(
      b_ih->type()->expect<c10::TensorType>()->withSizes({2 * hidden_size}));

  // TODO(T54563)
  auto *sequence_lens =
      createConstantInt(graph, {sequence_length}, {batch_size})->output();

  std::vector<torch::jit::Value *> args = {
      // [seq_length, batch_size, input_size]
      input,
      // [num_directions, hidden_size, input_size]
      prependDimension(graph, w_ih),
      // [num_directions, hidden_size, hidden_size]
      prependDimension(graph, w_hh),
      // [num_directions, 2*hidden_size]
      prependDimension(graph, b),
      // [batch_size]
      sequence_lens,
      // [num_directions, batch_size, hidden_size]
      hx,
  };
  auto *rnn = createRnn(graph, args, {nonlinearity});
  auto *output_0 = createReshape(graph, rnn->output(0),
                                 {sequence_length, batch_size, hidden_size})
                       ->output();
  if (batch_first) {
    // L, N, H_out -> N, L, H_out
    output_0 = createTranspose(graph, {output_0}, {1, 0, 2})->output();
  }

  replaceOutputUse(node->output(0), output_0);
  replaceOutputUse(node->output(1), rnn->output(1));
  markNodeForDeletion(node);
  return nullptr;
}

// aten::rnn_tanh: rnnHandler with the "Tanh" activation.
torch::jit::Node *rnnTanhHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  return rnnHandler(graph, node, "Tanh");
}

// aten::rnn_relu: rnnHandler with the "Relu" activation.
torch::jit::Node *rnnReluHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  return rnnHandler(graph, node, "Relu");
}

} // namespace

// Registers the recurrent-layer handlers when the library is loaded.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void
registration() {
  registerHandler(c10::aten::gru, gruHandler);
  registerHandler(c10::aten::lstm, lstmHandler);
  registerHandler(c10::aten::rnn_tanh, rnnTanhHandler);
  registerHandler(c10::aten::rnn_relu, rnnReluHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/RandomSamplingOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
// NOTE(review): the header name of the next #include was lost when this dump
// was extracted -- restore it from upstream.
#include

#include "PopartCanonicalizationUtils.hpp"

#include "../PoptorchStaticInit.hpp"
#include "../PoptorchSymbols.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"

namespace poptorch {
namespace {

// Canonicalises aten::normal for every scalar/tensor combination of mean and
// std. The output shape is taken from the node's output.
torch::jit::Node *normalHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Overloads for aten::normal
  // 1) both mean and std are scalar floats
  // aten::normal(float mean, float std, int[] size, Generator?, int? dtype,
  //              int? layout, Device? device, bool? pin_memory) -> Tensor
  //
  // 2) mean is a tensor and std is a scalar
  // aten::normal(Tensor mean, float std, Tensor? out)
  //
  // 3) mean is a scalar and std is a tensor
  // aten::normal(float mean, Tensor std, Tensor? out)
  //
  // 4) both mean and std are tensors
  // aten::normal(Tensor mean, Tensor std, Tensor? out)
  torch::jit::Value *mean = node->input(0);
  torch::jit::Value *std = node->input(1);
  std::vector<std::int64_t> shape = shapeFromTensor(node->output());

  bool mean_scalar = isConstantScalar(mean);
  bool std_scalar = isConstantScalar(std);

  if (mean_scalar && std_scalar) {
    // Both mean and std are scalar constant floats
    float mean_constant = constantToFloat(mean->node());
    float std_constant = constantToFloat(std->node());
    return createRandomNormal(graph, {mean, std}, shape, mean_constant,
                              std_constant);
  }

  // One or both of mean/std inputs must be tensors. Generate the output tensor
  // of random numbers drawn from separate normal distribution whose mean and
  // std are given as tensors using the following transform:
  //
  // normal(mean=0, std=1) * std + mean
  //
  // Broadcasting will take care of expanding any scalars to the correct shape.

  // Use {mean} to identify the type only
  auto mean_type = getNodeScalarType(mean);
  auto std_type = getNodeScalarType(std);
  if (mean_type != std_type) {
    // Cast whichever operand is the scalar to the tensor operand's type.
    if (mean_scalar && !std_scalar) {
      mean = createCast(graph, mean, std_type)->output();
    }
    if (!mean_scalar && std_scalar) {
      std = createCast(graph, std, mean_type)->output();
    }
  }
  torch::jit::Node *normal =
      createRandomNormal(graph, {mean, std}, shape, 0.0f, 1.0f);
  torch::jit::Node *mul = poptorch::createMul(graph, {normal->output(), std});
  return poptorch::createAdd(graph, {mul->output(), mean});
}

// Canonicalises aten::bernoulli as cast(uniform(0, 1) < prob), where prob is
// either the scalar probability argument or the input tensor itself.
torch::jit::Node *bernoulliHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  // aten::bernoulli(Tensor self, float? probability)

  // Check for scalar probability
  torch::jit::Value *prob = node->input(1);
  if (isNone(prob)) {
    // probabilities passed as input tensor
    prob = node->input(0);
  }

  std::vector<std::int64_t> shape = shapeFromTensor(node->output());
  c10::ScalarType dtype = getNodeScalarType(node->input(0));
  torch::jit::Value *uniform =
      createRandomUniform(graph, nullptr, shape, 1.0, 0.0, dtype)->output();
  torch::jit::Value *lt = createLess(graph, {uniform, prob})->output();
  return createCast(graph, lt, dtype);
}

// Canonicalises aten::exponential_ by inverse-transform sampling:
// -log(uniform) / lambda, computed in float and cast back to self's dtype.
torch::jit::Node *exponentialHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  // aten::exponential_(Tensor self, double lambda)
  torch::jit::Value *self = node->input(0);
  torch::jit::Value *lambda = node->input(1);
  torch::jit::Value *output = node->output();
  std::vector<std::int64_t> shape = shapeFromTensor(output);
  c10::ScalarType dtype = getNodeScalarType(self);
  c10::ScalarType dtype_rng = c10::ScalarType::Float;
  // Use smallest non-zero value to prevent the possibility of
  // log(0) with minimal bias on the sampling distribution
  float low = std::numeric_limits<float>::min();
  torch::jit::Value *x =
      createRandomUniform(graph, nullptr, shape, 1.0, low, dtype_rng)
          ->output();
  auto *log_x = createLog(graph, {x})->output();
  auto *neg_log_x = createNeg(graph, {log_x})->output();
  auto *exponential = createDiv(graph, {neg_log_x, lambda})->output();
  return createCast(graph, exponential, dtype);
}

} // namespace

// Registers the random-sampling handlers when the library is loaded.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void
registration() {
  registerHandler(c10::aten::normal, normalHandler);
  registerHandler(c10::aten::bernoulli, bernoulliHandler);
  registerHandler(c10::aten::exponential_, exponentialHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/ReduceOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
// NOTE(review): the header names of the two bare #include lines in this file
// were lost when this dump was extracted -- restore them from upstream.
#include

#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"

#include "poptorch/OpBuilder.hpp"
#include "poptorch_logging/Error.hpp"

#include "../PoptorchSymbols.hpp"
#include "poptorch/Utils.hpp"
#include

namespace poptorch {
namespace {

// Shared handler for aten::prod / mean / sum / logsumexp / any / all.
// Boolean inputs are cast to Int first. When no dimension is given the input
// is reshaped to 1D and reduced over axis 0.
torch::jit::Node *reduceHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // Reductions have three overloads. The first is:
  // aten::mean(Tensor self, int[] dim, int keepdim, Tensor? out)) -> tensor

  // The second is:
  // aten::mean(Tensor self, int? dtype)) -> tensor

  // The third is for boolean reductions
  // aten::all(Tensor self) -> tensor

  torch::jit::Symbol const kind = node->kind();
  torch::jit::Value *input = node->input(0);

  // sum and prod works even for bool types in PyTorch
  const auto tensor_type = input->type()->expect<c10::TensorType>();
  if (tensor_type->scalarType() == at::ScalarType::Bool) {
    auto *cast_node = createCast(graph, input, c10::ScalarType::Int);
    input = cast_node->output();
  }

  std::vector<std::int64_t> axes{};
  std::int64_t keepdim = 0;

  // Case 2/3 or case 1 with no dimension specified.
  const size_t case_2_3 =
      (kind == c10::aten::any || kind == c10::aten::all) ? 1 : 2;
  bool flatten = node->inputs().size() == case_2_3;
  if (!flatten) {
    // Case 1.
    // Sometimes the dimensions are just one int.
    if (node->input(1)->node()->kind() ==
        symbols::poptorch::tensor_constant) {
      axes.push_back(constantToLong(node->input(1)->node()));
    } else {
      axes = constantToLongVec(node->input(1)->node());
      // No dimension specified: this is actually a case 1.
      if (axes.empty()) {
        flatten = true;
      }
    }

    keepdim = constantToLong(node->input(2)->node());
  }
  if (flatten) {
    // Need to use reshape as "Flatten" is for 2D output
    auto numels_optional = tensor_type->numel();
    ERROR_ON(!numels_optional);
    input = createReshape(graph, input,
                          {static_cast<std::int64_t>(*numels_optional)})
                ->output();
    axes = {0};
    keepdim = 0;
  }

  // Output the correct reduction.
  if (kind == c10::aten::prod) {
    return createReduceprod(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::mean) {
    return createReducemean(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::sum) {
    return createReducesum(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::logsumexp) {
    return createReducelogsumexp(graph, {input}, axes, keepdim);
  }
  if (kind == c10::aten::all) {
    // all(x) == (min(|x|) != 0)
    auto *t0 = createAbs(graph, {input})->output();
    auto *t1 = createReducemin(graph, {t0}, axes, keepdim)->output();
    return createCast(graph, t1, at::ScalarType::Bool);
  }
  if (kind == c10::aten::any) {
    // any(x) == (max(|x|) != 0)
    auto *t0 = createAbs(graph, {input})->output();
    auto *t1 = createReducemax(graph, {t0}, axes, keepdim)->output();
    return createCast(graph, t1, at::ScalarType::Bool);
  }
  ERROR("Popart Canonicalisation: UNREACHABLE reached in reductions.");
}

// Canonicalises aten::median. The single-input overload reduces over the
// axes returned by reduceHelperDimensionCreator and drops the indices output.
torch::jit::Node *reduceMedianHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *input = node->input(0);
  std::vector<std::int64_t> axes;
  std::int64_t keepdim = 0;
  torch::jit::Node *output;

  if (node->inputs().size() == 1) {
    // aten::median(Tensor self) -> Tensor
    axes = reduceHelperDimensionCreator(input);
    auto *reduced = createReducemedian(graph, {input}, axes, keepdim);
    reduced->eraseOutput(1);
    output = reduced;
  } else {
    // aten::median(Tensor self, int dim, bool keepdim)
    //           -> (Tensor values, Tensor indices)
    axes.push_back(constantToLong(node->input(1)->node()));
    keepdim = constantToLong(node->input(2)->node());
    output = createReducemedian(graph, {input}, axes, keepdim);
  }

  return output;
}

// Canonicalises aten::amax / aten::amin. With an empty dim list the input is
// flattened and reduced over axis 1.
torch::jit::Node *aMinMaxHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // aten::max(Tensor self, int[] dim, int keepdim)
  // aten::min(Tensor self, int[] dim, int keepdim)
  auto *input = node->input(0);
  auto axes = constantToLongVec(node->input(1)->node());
  const auto keepdim = constantToLong(node->input(2)->node());

  if (axes.empty()) {
    input = createFlatten(graph, {input}, 0)->output();
    axes = {1};
  }

  if (node->kind() == c10::aten::amax) {
    return createReducemax(graph, {input}, axes, keepdim);
  }
  return createReducemin(graph, {input}, axes, keepdim);
}

// Canonicalises aten::argmax / aten::argmin; the result is cast to Int.
torch::jit::Node *argMinMaxHandler(torch::jit::Graph *graph,
                                   torch::jit::Node *node) {
  //  aten::argmin(Tensor in, int? dim, int keep_dims) -> Tensor
  //  aten::argmax(Tensor in, int? dim, int keep_dims) -> Tensor
  //  dim (int) - the dimension to reduce. If None, the argmax
  //             of the flattened input is returned.

  torch::jit::Symbol const kind = node->kind();
  torch::jit::Value *input = node->input(0);

  std::optional<std::int64_t> dim;
  if (node->input(1)->node()->kind() == symbols::poptorch::tensor_constant) {
    dim = constantToLong(node->input(1)->node());
  }

  std::int64_t const keep_dim = constantToLong(node->input(2)->node());

  // If dim is not provided we will flatten the input with axis 0 and reduce
  // over dimension 1 of the flattened tensor (the same convention as
  // aMinMaxHandler), so default to 1 in that case.
  std::int64_t dim_to_use = 1;

  // Check if dim is NONE.
  if (!dim) {
    torch::jit::Node *flatten = createFlatten(graph, {node->input(0)}, 0);
    input = flatten->output();
  } else {
    dim_to_use = *dim;
  }

  torch::jit::Node *indices;
  // Create the actual argmax/argmin.
  if (kind == c10::aten::argmax) {
    indices = createArgmax(graph, {input}, dim_to_use, keep_dim);
  } else {
    indices = createArgmin(graph, {input}, dim_to_use, keep_dim);
  }
  // Note: these ops return int64, so we need to cast them to int
  return createCast(graph, indices->output(), c10::ScalarType::Int);
}

// Canonicalises aten::argsort as a full-length top-k along `dim`; the
// indices are reversed along `dim` when ascending order is requested.
torch::jit::Node *argsortHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = x->type()->expect<c10::TensorType>();
  const std::vector<std::int64_t> shape = shapeFromTensor(node->input(0));
  const auto dim = handleDimensionParam(node->input(1), t0);
  auto *size = createConstantLong(graph, {shape[dim]}, {1})->output();
  auto *topk =
      createTopk(graph, {x, size}, dim, true /*largest*/, true /*sorted*/);
  auto *indices = topk->output(1);

  // Onnx will output the indices long, so use a cast to revert the type.
  // PopART will remove it as an identity when topk resolves to output an int.
  indices = createCast(graph, indices, c10::ScalarType::Int)->output();

  const auto descending = constantToBool(node->input(2)->node());
  if (descending) {
    return indices->node();
  }
  const std::vector<std::int64_t> dims{dim};
  return createReverse(graph, {indices}, dims);
}

// Handles the (values, indices) overloads of aten::min / aten::max along a
// dimension via top-1 (min negates the input before and after). The original
// node's two outputs are rewired and the node is marked for deletion, hence
// the nullptr return.
torch::jit::Node *minMaxWithIndicesHandler(torch::jit::Graph *graph,
                                           torch::jit::Node *node) {
  auto *x = node->input(0);
  auto t0 = x->type()->expect<c10::TensorType>();
  const std::vector<std::int64_t> shape = shapeFromTensor(x);
  torch::jit::Value *values;
  torch::jit::Value *indices;

  if (shape.empty()) {
    // Scalar input: the value is the input itself and the index is 0.
    values = createIdentity(graph, {x})->output();
    indices = createConstantInt(graph, {0}, {})->output();
  } else {
    const auto dim = handleDimensionParam(node->input(1), t0);
    const auto keepdim = constantToBool(node->input(2)->node());
    const bool negate = node->kind() == c10::aten::min;

    if (negate) {
      x = createNeg(graph, {x})->output();
    }

    auto *one = tensorToConstant(graph, at::tensor(1L))->output();
    auto *result =
        createTopk(graph, {x, one}, dim, true /*largest*/, true /*sorted*/);
    values = result->output(0);
    indices = result->output(1);

    // TopK returns UINT32 indices, but torch doesn't have unsigned
    // 32 bit integer tensor types so we need to cast back to INT32
    indices = createCast(graph, indices, c10::ScalarType::Int)->output();

    if (negate) {
      values = createNeg(graph, {values})->output();
    }

    if (!keepdim) {
      // Squeeze out the singleton-dim left by topk
      values = createSqueeze(graph, {values}, {dim})->output();
      indices = createSqueeze(graph, {indices}, {dim})->output();
    }
  }

  replaceOutputUse(node->output(0), values);
  replaceOutputUse(node->output(1), indices);
  markNodeForDeletion(node);
  return nullptr;
}

// Dispatches aten::min / aten::max by input count: one input reduces over the
// axes from reduceHelperDimensionCreator via `reduceFunc`, two inputs apply
// `extremaFunc` to the pair, anything else goes to minMaxWithIndicesHandler.
template <typename ReduceFunc, typename ExtremaFunc>
torch::jit::Node *minMaxHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node,
                                ReduceFunc &&reduceFunc,
                                ExtremaFunc &&extremaFunc) {
  if (node->inputs().size() == 1) {
    auto *x = node->input(0);
    auto t0 = reduceHelperDimensionCreator(x);
    return reduceFunc(graph, {x}, t0, 0);
  }

  if (node->inputs().size() == 2) {
    auto *i0 = node->input(0);
    auto *i1 = node->input(1);
    return extremaFunc(graph, {i0, i1});
  }

  return minMaxWithIndicesHandler(graph, node);
}

// aten::min / aten::minimum.
torch::jit::Node *minHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  return minMaxHandler(graph, node, createReducemin, createMin);
}

// aten::max / aten::maximum.
torch::jit::Node *maxHandler(torch::jit::Graph *graph, torch::jit::Node *node) {
  return minMaxHandler(graph, node, createReducemax, createMax);
}

// Canonicalises aten::norm / aten::linalg_vector_norm. p == 1, 2 and +/-inf
// map to dedicated reductions; any other p uses sum(|x|^p)^(1/p).
torch::jit::Node *tensorNormHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  // aten::norm(Tensor in, int p) -> Tensor
  // aten::norm(Tensor in, float p) -> Tensor
  // aten::norm(Tensor in, int p, int[] dim, int keepdim) -> Tensor
  // aten::norm(Tensor in, float p, int[] dim, int keepdim) -> Tensor
  // aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim, *,
  //            ScalarType dtype) -> Tensor
  // aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *,
  //            ScalarType dtype) -> Tensor
  // aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *,
  //            Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *,
  //            Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *,
  //            Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *,
  //            Tensor(a!) out) -> Tensor(a!)
  //
  // aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim, *,
  //            ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, int[1] dim, bool keepdim, *,
  //            ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *,
  //            ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  // aten::norm(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *,
  //            ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
  //
  // aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  // aten::linalg_norm(Tensor self, str ord, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  // aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None,
  //                   Tensor(a!) out) -> Tensor(a!)
  // aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None,
  //                   Tensor(a!) out) -> Tensor(a!)
  // aten::linalg_norm(Tensor self, str ord, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None,
  //                   Tensor(a!) out) -> Tensor(a!)
  // aten::linalg_norm(Tensor self, str ord, int[1]? dim=None,
  //                   bool keepdim=False, *, ScalarType? dtype=None,
  //                   Tensor(a!) out) -> Tensor(a!)

  torch::jit::Value *input = node->input(0);
  torch::jit::Value *p_val = node->input(1);
  std::vector<std::int64_t> axes{};
  std::int64_t keepdim = 0;

  if (node->inputs().size() == 2) {
    torch::jit::Node *flatten = createFlatten(graph, {input}, 0);
    input = flatten->output();
    axes = {1};
  } else {
    auto *axes_val = node->input(2);
    if (!isNone(axes_val)) {
      axes = constantToLongVec(node->input(2)->node());
    }
    keepdim = constantToLong(node->input(3)->node());
    const auto shape = shapeFromTensor(input);

    // Empty axes array means reduce over all axes in PyTorch, but means
    // do nothing in PopART
    if (axes.empty()) {
      axes.resize(shape.size());
      std::iota(std::begin(axes), std::end(axes), 0);
    }

    // handle optional dtype
    if (node->inputs().size() >= 5) {
      auto *input_4 = node->input(4);
      const bool is_scalar_type =
          input_4->type()->kind() == c10::TypeKind::ScalarTypeType;
      if (is_scalar_type) {
        if (auto *opt_dtype = input_4; opt_dtype->mustNotBeNone()) {
          const auto &opt_dtype_tensors =
              opt_dtype->node()->ts(c10::attr::value);
          ERROR_ON(opt_dtype_tensors.empty());
          if (!opt_dtype_tensors.front().is_floating_point()) {
            input = createCast(graph, input,
                               constantToScalarType(opt_dtype->node()))
                        ->output();
          }
        }
      }
    }

    // If we're reducing over singleton dims and keeping them, the
    // behaviour of PopART reduce ops is to do nothing, but PyTorch will
    // still take the absolute value of the tensor, so we need to
    // do the same
    //
    // The axes come straight from the caller and may be negative (e.g.
    // dim=-1), so wrap them before indexing into `shape`.
    const auto rank = static_cast<std::int64_t>(shape.size());
    if ((keepdim != 0) &&
        std::all_of(axes.begin(), axes.end(), [&](std::int64_t i) {
          return shape[i < 0 ? i + rank : i] == 1;
        })) {
      return createAbs(graph, {input});
    }
  }

  constexpr float pos_inf = std::numeric_limits<float>::infinity();
  constexpr float neg_inf = -std::numeric_limits<float>::infinity();
  const float p = constantToFloat(node->input(1)->node());

  if (p == 1.0) {
    return createReducel1(graph, {input}, axes, keepdim);
  }
  if (p == 2.0) {
    return createReducel2(graph, {input}, axes, keepdim);
  }
  if (p == pos_inf || p == neg_inf) {
    // max/min(abs(x))
    torch::jit::Node *abs = createAbs(graph, {input});
    input = abs->output();

    if (p == pos_inf) {
      return createReducemax(graph, {input}, axes, keepdim);
    }
    return createReducemin(graph, {input}, axes, keepdim);
  }

  // sum(abs(x)**p)**(1./p)
  torch::jit::Node *abs = createAbs(graph, {input});
  torch::jit::Node *pow = createPow(graph, {abs->output(), p_val});
  torch::jit::Node *sum =
      createReducesum(graph, {pow->output()}, axes, keepdim);

  at::ScalarType const p_type = getNodeScalarType(p_val);
  if (p_type == c10::ScalarType::Int || p_type == c10::ScalarType::Long) {
    // Cast int to float before reciprocal
    torch::jit::Node *to_float = createCast(graph, p_val, c10::kFloat);
    p_val = to_float->output();
  }

  torch::jit::Node *one_over_p = createReciprocal(graph, {p_val});
  return createPow(graph, {sum->output(), one_over_p->output()});
}

// Canonicalises aten::frobenius_norm as an L2 reduction. Accepts the
// 1-operand (reduce over everything) and 3-operand (dims, keepdim) forms.
torch::jit::Node *frobeniusnormHandler(torch::jit::Graph *graph,
                                       torch::jit::Node *node) {
  if (node->inputs().size() == 1) {
    auto *x = node->input(0);
    auto t0 = reduceHelperDimensionCreator(x);
    return createReducel2(graph, {x}, t0, 0);
  }
  if (node->inputs().size() == 3) {
    auto *x = node->input(0);
    auto *l = node->input(1);
    const auto t0 = constantToLongVec(l->node());
    const auto t1 = reduceHelperDimensionCreator(x, t0);
    auto *c = node->input(2);
    const auto t2 = constantToLong(c->node());
    const auto shape = shapeFromTensor(x);

    // If we're reducing over singleton dims and keeping them, the
    // behaviour of PopART reduce ops is to do nothing, but PyTorch will
    // still take the absolute value of the tensor, so we need to
    // do the same
    if ((t2 != 0) && std::all_of(t1.begin(), t1.end(), [&](std::int64_t i) {
          return shape[i] == 1;
        })) {
      return createAbs(graph, {x});
    }

    return createReducel2(graph, {x}, t1, t2);
  }
  ERROR("Incorrect number of arguments for operator "
        << "c10::aten::frobenius_norm. "
        << "Expecting 1 or 3 operands, "
        << "got " << node->inputs().size() << " operand(s).");
  return nullptr;
}

// count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
//
// Implemented as sum(where(bool(self), 1, 0)) over `dim`; an empty `dim`
// means every dimension.
torch::jit::Node *countNonzeroHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  auto *self = node->input(0);
  auto dim = constantToLongVec(node->input(1)->node());

  if (dim.empty()) {
    // Only the length of the shape vector matters here: it is immediately
    // overwritten with 0..rank-1.
    dim = shapeFromTensor(self);
    std::iota(dim.begin(), dim.end(), 0);
  }

  auto *self_bool = self;
  if (getNodeScalarType(self) != c10::ScalarType::Bool) {
    self_bool = createCast(graph, self, c10::ScalarType::Bool)->output();
  }

  auto *where =
      createWhere(graph, {self_bool, wrapInConstant1D(graph, 1),
                          wrapInConstant1D(graph, 0)});

  return createReducesum(graph, {where->output()}, dim, /*keepdims=*/0);
}

// Canonicalises aten::nansum: NaNs are replaced by zero before summing, and
// the result is cast when a dtype is supplied.
torch::jit::Node *nanSumHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // isNan -> where -> sum -> cast (if applicable) -> out
  torch::jit::Value *in_tensor = node->input(0);
  auto *is_nan = createIsnan(graph, {in_tensor});
  auto *zeros = createConstantFloatLike(graph, in_tensor, {0},
                                        shapeFromTensor(in_tensor));
  auto *non_nans =
      createWhere(graph, {is_nan->output(0), zeros->output(0), in_tensor});
  std::vector<std::int64_t> dims;
  auto *dim = node->input(1);
  if (auto *n = dim->node(); n->kind() == c10::prim::ListConstruct) {
    dims = constantToLongVec(n);
  } else if (isNone(dim)) {
    // We only get a node with Constant kind if `dim` is not
    // provided, so perform the sum over all the dimensions.
    const auto in_dim_count = shapeFromTensor(in_tensor).size();
    dims.resize(in_dim_count);
    std::iota(dims.begin(), dims.end(), 0);
  } else {
    ERROR("Popart Canonicalisation: UNREACHABLE reached in nansum handler.");
  }
  const auto keepdim = constantToLong(node->input(2)->node());
  auto *sum = createReducesum(graph, {non_nans->output(0)}, dims, keepdim);
  auto *dtype = node->input(3);
  if (!isNone(dtype)) {
    const auto type = constantToScalarType(dtype->node());
    return createCast(graph, sum->output(0), type);
  }
  return sum;
}
} // namespace

// Registers the reduction handlers when the library is loaded.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void
registration() {
  registerHandler(c10::aten::amax, aMinMaxHandler);
  registerHandler(c10::aten::amin, aMinMaxHandler);
  registerHandler(c10::aten::argmax, argMinMaxHandler);
  registerHandler(c10::aten::argmin, argMinMaxHandler);
  registerHandler(c10::aten::argsort, argsortHandler);
  registerHandler(c10::aten::prod, reduceHandler);
  registerHandler(c10::aten::mean, reduceHandler);
  registerHandler(c10::aten::median, reduceMedianHandler);
  registerHandler(c10::aten::sum, reduceHandler);
  registerHandler(c10::aten::logsumexp, reduceHandler);
  registerHandler(c10::aten::norm, tensorNormHandler);
  registerHandler(c10::aten::linalg_vector_norm, tensorNormHandler);
  registerHandler(c10::aten::frobenius_norm, frobeniusnormHandler);
  registerHandler(c10::aten::min, minHandler);
  registerHandler(c10::aten::minimum, minHandler);
  registerHandler(c10::aten::max, maxHandler);
  registerHandler(c10::aten::maximum, maxHandler);
  registerHandler(c10::aten::any, reduceHandler);
  registerHandler(c10::aten::all, reduceHandler);
  registerHandler(c10::aten::count_nonzero, countNonzeroHandler);
  registerHandler(c10::aten::nansum, nanSumHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/ReshapeOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include #include "../PoptorchStaticInit.hpp" #include "PopartCanonicalizationUtils.hpp" #include "ScatterReduction.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "../PoptorchSymbols.hpp" #include namespace poptorch { namespace { torch::jit::Node *expandHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) // aten::expand(Tensor self, int[] size) -> Tensor // NB signature in source has an, apparently unused boolean: // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> // Tensor(a) torch::jit::Node *new_node; // Extract the type from the pytorch IR. c10::TensorTypePtr const self_tensor = node->input(0)->type()->expect(); c10::VaryingShape const self_dims = self_tensor->sizes(); // Old shape const std::vector old_shape = shapeFromTensor(node->input(0)); // Count the elems in the old shape. const std::int64_t old_elem_count = std::accumulate( old_shape.begin(), old_shape.end(), 1, std::multiplies()); // Get the target size for the expand. std::vector new_shape = constantToLongVec(node->input(1)->node()); ERROR_ON_MSG(new_shape.size() < old_shape.size(), "The desired shape passed to expand should have at least as " "many dimensions as the input tensor (required at least " << old_shape.size() << ", got " << new_shape.size() << ")"); // A new shape element of -1 means that dimension should not change for (size_t i = 0; i < old_shape.size(); i++) { // If you give more dimensions in the desired shape than there are in the // input tensor, they'll get *pre*pended -- so to turn the -1s into lengths // from the input, work backwards. 
const auto input_idx = old_shape.size() - (i + 1); const auto input_len = old_shape[input_idx]; const auto desired_idx = new_shape.size() - (i + 1); const auto desired_len = new_shape[desired_idx]; if (desired_len == -1) { new_shape[desired_idx] = input_len; } else if (desired_len != input_len && input_len != 1) { ERROR("Can only expand dimensions of size 1; however, trying " "to expand dimension " << input_idx << " of size " << input_len << " to " << desired_len); } } // Count the number of elements in the target shape. const std::int64_t new_elem_count = std::accumulate( new_shape.begin(), new_shape.end(), 1, std::multiplies()); // Elements don't change so just a reshape. if (new_elem_count == old_elem_count) { new_node = createReshape(graph, node->input(0), new_shape); } else { // Otherwise we are expanding the original tensor. new_node = createConstantInt(graph, new_shape, {static_cast(new_shape.size())}); new_node = createCast(graph, new_node->output(), c10::kLong); new_node = createExpand(graph, {node->input(0), new_node->output()}); } return new_node; } torch::jit::Node *flattenHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> // Tensor(a) std::int64_t start_dim = constantToLong(node->input(1)->node()); std::int64_t end_dim = constantToLong(node->input(2)->node()); c10::TensorTypePtr const self_tensor = node->input(0)->type()->expect(); c10::VaryingShape const self_dims = self_tensor->sizes(); // Respect PyTorch negative dimensions if (end_dim < 0) { end_dim = (*self_dims.sizes()).size() + end_dim; } if (start_dim < 0) { start_dim = (*self_dims.sizes()).size() + start_dim; } std::vector new_shape; int dim = 0; std::int64_t flattened_dims = 1; // Flatten the selected dimensions. 
for (auto optional_int : *self_dims.sizes()) { if (dim < start_dim || dim > end_dim) { new_shape.push_back(*optional_int); } else { flattened_dims *= *optional_int; } if (dim == end_dim) { new_shape.push_back(flattened_dims); } dim++; } return createReshape(graph, node->input(0), new_shape); } torch::jit::Node *asStridedHandler(torch::jit::Graph * /*graph*/, torch::jit::Node * /*node*/) { // as_strided(Tensor(a) self, int[] size, int[] stride, int? // storage_offset=None) -> Tensor(a) // as_strided is very generic and as a result complex and expensive to handle. // However it is always generated as part of a decomposition so we should // catch whichever op is getting decomposed rather than deal with as_strided. ERROR( "InternalError: aten::as_strided should have been intercepted earlier."); return nullptr; } torch::jit::Node *reshapeHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::view(Tensor(a) self, int[] size) -> (Tensor(a)) // aten::_unsafe_view(Tensor self, SymInt[] size) -> Tensor const std::vector new_shape = shapeFromTensor(node->output()); // Reshape the tensor into that shape. return createReshape(graph, node->input(0), new_shape); } torch::jit::Node *expandAsHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::expand(Tensor self, int[] size, *, bool implicit) -> Tensor // aten::expand_as(Tensor self, Tensor other) -> Tensor torch::jit::Node *new_node; // Extract the type from the pytorch IR. c10::TensorTypePtr const self_tensor = node->input(0)->type()->expect(); c10::VaryingShape const self_dims = self_tensor->sizes(); std::int64_t old_elem_count = 0; for (auto optional_int : *self_dims.sizes()) { old_elem_count += *optional_int; } // Extract the type from the pytorch IR. c10::TensorTypePtr const as_tensor = node->input(1)->type()->expect(); c10::VaryingShape const dims = as_tensor->sizes(); // Convert that IR type into a C++ vector of ints. 
std::vector new_shape; std::int64_t new_elem_count = 0; for (auto optional_int : *dims.sizes()) { new_shape.push_back(*optional_int); new_elem_count += *optional_int; } // Elements don't change so just a reshape. if (new_elem_count == old_elem_count) { new_node = createReshape(graph, node->input(0), new_shape); } else { new_node = createConstantInt(graph, new_shape, {static_cast(new_shape.size())}); new_node = createCast(graph, new_node->output(), c10::kLong); new_node = createExpand(graph, {node->input(0), new_node->output()}); } return new_node; } torch::jit::Node *selectHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::select(Tensor self, int dim, int index) -> Tensor // Note: there is also this overload which is not supported at the moment // aten::select(Tensor[] list, int idx) -> Tensor auto *input = node->input(0); std::int64_t dim = constantToLong(node->input(1)->node()); const auto dims = shapeFromTensor(input); if (dim < 0) { dim += dims.size(); } auto *index_node = node->input(2)->node(); torch::jit::Node *slice_node; if (!isTensorConstant(index_node)) { // Handle dynamic index slice_node = createDynamicslice(graph, {input, index_node->output()}, {dim}, {1}, 1); } else { // Handle static index std::int64_t index = constantToLong(index_node); if (index < 0) { index += dims.at(dim); } slice_node = createSlice(graph, {input}, {index + 1}, {index}, {dim}); } // Reshape to remove the singleton dimenson left in by slice const auto original_shape = shapeFromTensor(node->output()); return createReshape(graph, slice_node->output(), original_shape); } torch::jit::Node *contiguousHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::contiguous(Tensor self, *, MemoryFormat // memory_format=contiguous_format) -> Tensor Returns a copy of the tensor but // in contiguous memory. 
// // Returns the tensor UNUSED(graph); node->output()->replaceAllUsesWith(node->input(0)); markNodeForDeletion(node); return nullptr; } torch::jit::Node *permuteHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::permute(Tensor self, int[] dims) -> Tensor std::vector permutation = constantToLongVec(node->input(1)->node()); c10::TensorTypePtr const as_tensor = node->input(0)->type()->cast(); c10::VaryingShape dims = as_tensor->sizes(); std::for_each(permutation.begin(), permutation.end(), [&](std::int64_t &val) { if (val < 0) { val = *dims.size() + val; } }); return createTranspose(graph, {node->input(0)}, permutation); } // Get the indices for im2col std::vector getGatherIndices(int64_t orig_rows, int64_t orig_cols, int64_t kernel_size_x, int64_t kernel_size_y, int64_t dilation_x, int64_t dilation_y, int64_t padding_x, int64_t padding_y, int64_t extra_padding, int64_t stride_x, int64_t stride_y) { const auto spatial_rows = (orig_rows + 2 * padding_y - dilation_y * (kernel_size_y - 1) - 1) / stride_y + 1; const auto spatial_cols = (orig_cols + 2 * padding_x - dilation_x * (kernel_size_x - 1) - 1) / stride_x + 1; const auto spatial_row_cols_product = spatial_rows * spatial_cols; const auto numel = spatial_row_cols_product * kernel_size_x * kernel_size_y; std::vector indices; indices.reserve(numel); for (int64_t idx = 0; idx < numel; idx++) { const auto kernel_offset = idx / spatial_row_cols_product; const auto kernel_x_offset = (kernel_offset % kernel_size_x) * dilation_x; const auto kernel_y_offset = (kernel_offset / kernel_size_x) * dilation_y; const auto spatial_offset = idx % spatial_row_cols_product; const auto spatial_x_offset = (spatial_offset % spatial_cols) * stride_x; const auto spatial_y_offset = (spatial_offset / spatial_cols) * stride_y; const auto actual_x = spatial_x_offset + kernel_x_offset; const auto actual_y = spatial_y_offset + kernel_y_offset; const auto in_idx = actual_y * (orig_cols + 2 * padding_x + extra_padding) + actual_x; 
if (actual_x < 0 || actual_y < 0) { ERROR("Out of range too low"); } if (actual_x < 0 || actual_y < 0 || actual_x >= (orig_cols + 2 * padding_x + 10) || actual_y >= (orig_rows + 2 * padding_y)) { ERROR("Out of range"); } indices.push_back(in_idx); } return indices; } // Reorder the padded im2col input to permit longer slices. // Update supplied indices in place to match: these will have longer // consecutive sequences. torch::jit::Node *reorderBasedOnStride(torch::jit::Graph *graph, torch::jit::Value *padded, const std::vector &data_shape, int64_t stride, int64_t last_dim_size, std::vector *indices) { // Reshape to allow slicing based on index modulo stride auto *reshaped = createReshape(graph, padded, {data_shape[0], data_shape[1], -1, stride}); // Slice and concatenate to order based on module stride std::vector stride_sliced_flattened; stride_sliced_flattened.reserve(stride); for (int64_t start = 0; start < stride; start++) { auto *stride_sliced = createSlice(graph, {reshaped->output()}, {start + 1}, {start}, {3}); auto *stride_flattened = createReshape(graph, stride_sliced->output(), {data_shape[0], data_shape[1], -1}); stride_sliced_flattened.push_back(stride_flattened->output()); } auto *concat = createConcat(graph, stride_sliced_flattened, 2); // Alter the indices to match for (size_t idx = 0; idx < indices->size(); idx++) { const uint64_t old_idx = (*indices)[idx]; (*indices)[idx] = (old_idx % stride) * (last_dim_size / stride) + old_idx / stride; } return concat; } // Convert indices to slices by accumulating consecutive indices into a single // slice. Returns slice values as a pair (start, end). 
std::vector> indicesToSlices(const std::vector &indices) { ERROR_ON(indices.empty()); // Represents the start and end of each slice in a pair std::vector> slices; int64_t slice_start = indices[0]; for (auto it = indices.begin() + 1; it != indices.end(); it++) { auto previous = *(it - 1); auto current = *it; if (current != previous + 1) { slices.emplace_back(slice_start, previous + 1); slice_start = current; } } // Handle the last slice slices.emplace_back(slice_start, indices.back() + 1); return slices; } torch::jit::Node *im2colHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::im2col(Tensor self, int[2] kernel_size, int[2] dilation, // int[2] padding, int[2] stride) -> Tensor torch::jit::Value *data = node->input(0); const std::vector data_shape = shapeFromTensor(data); ERROR_ON(data_shape.size() != 4); const std::vector kernel_shape = constantToLongVec(node->input(1)->node()); ERROR_ON(kernel_shape.size() != 2); const std::vector dilation = constantToLongVec(node->input(2)->node()); ERROR_ON(dilation.size() != 2); const std::vector padding = constantToLongVec(node->input(3)->node()); ERROR_ON(padding.size() != 2); const std::vector strides = constantToLongVec(node->input(4)->node()); ERROR_ON(strides.size() != 2); // First zero-pad the input // Pytorch gives the padding as being the amount to pad in both // directions. Popart has two arguments for each axis, the amount to pad in // each direction along that axis. In the form (Axis0Left, AxisNLeft..., // Axis0Right, AxisNRight) where left and right refer to the direction // along the axis to add zeros to. 
std::vector popart_padding{0, 0, padding[0], padding[1], 0, 0, padding[0], padding[1]}; // Increase RHS padding to ensure that the number of cols divides by the // x stride value auto current_width = data_shape[3] + padding[1] * 2; auto extra_padding = strides[1] - (current_width % strides[1]); extra_padding = extra_padding % strides[1]; popart_padding.back() += extra_padding; current_width += extra_padding; auto *padded = createConstantPad(graph, node->input(0), popart_padding, 0., true); auto padded_shape = shapeFromTensor(padded->output()); // Get the indices as if the spatial dimensions had been flattened auto indices = getGatherIndices(data_shape[2], data_shape[3], kernel_shape[1], kernel_shape[0], dilation[1], dilation[0], padding[1], padding[0], extra_padding, strides[1], strides[0]); // Calculate the last dim size as if it was flattened const auto last_dim_size = current_width * (data_shape[2] + padding[0] * 2); // Reorder to allow fewer slices then each index became a slice auto *rearranged = reorderBasedOnStride(graph, padded->output(), data_shape, strides[1], last_dim_size, &indices); const auto slices_start_end = indicesToSlices(indices); // Slice and concat for the reordering std::vector sliced; sliced.reserve(slices_start_end.size()); for (auto slice_start_end : slices_start_end) { sliced.push_back(createSlice(graph, {rearranged->output()}, {slice_start_end.second}, {slice_start_end.first}, {2}) ->output()); } auto *concat = createConcat(graph, sliced, 2); // Finally reshape to match PyTorch's expectation return createReshape( graph, concat->output(), {data_shape[0], data_shape[1] * kernel_shape[0] * kernel_shape[1], -1}); } // Make the scatter reduces indices for col2im at::Tensor getScatterReduceIndices(int64_t num_cols, int64_t orig_rows, int64_t orig_cols, int64_t kernel_size_x, int64_t kernel_size_y, int64_t dilation_x, int64_t dilation_y, int64_t padding_x, int64_t padding_y, int64_t stride_x, int64_t stride_y) { // Add unity dimensions for 
batch and channel to facilitate tiling later auto indices = at::empty({1, 1, num_cols}, at::dtype(at::ScalarType::Int) .memory_format(c10::MemoryFormat::Contiguous)); auto *indices_ptr = indices.data_ptr(); // The last dim has a mix of all kernel and spatial positions. Calculate // the number of spatial columns. const auto spatial_cols = ((orig_cols + 2 * padding_x - dilation_x * (kernel_size_x - 1) - 1) / stride_x) + 1; // spatial_rows*spatial_cols // (a short cut compared to calculating spatial_rows using the equivalent // expression used for spatial_cols) const auto spatial_row_cols_product = num_cols / (kernel_size_x * kernel_size_y); // Find the original co-ordinate (x, y) from which the value in col_idx was // copied and calculate what the index would be for (int64_t col_idx = 0; col_idx < num_cols; col_idx++) { const auto kernel_offset = col_idx / spatial_row_cols_product; const auto kernel_x_offset = (kernel_offset % kernel_size_x) * dilation_x; const auto kernel_y_offset = (kernel_offset / kernel_size_x) * dilation_y; const auto spatial_offset = col_idx % (spatial_row_cols_product); const auto spatial_x_offset = (spatial_offset % spatial_cols) * stride_x; const auto spatial_y_offset = (spatial_offset / spatial_cols) * stride_y; const auto actual_x = spatial_x_offset + kernel_x_offset - padding_x; const auto actual_y = spatial_y_offset + kernel_y_offset - padding_y; auto index = actual_y * orig_cols + actual_x; // If out of range, use an out of range index. Poplar will skip this // index. 
if (actual_x < 0 || actual_y < 0 || actual_x >= orig_cols || actual_y >= orig_rows) { index = orig_rows * orig_cols; } *indices_ptr = static_cast(index); indices_ptr++; // NOLINT } return indices; } torch::jit::Node *col2imHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::col2im(Tensor self, int[2] output_size, int[2] kernel_size, // int[2] dilation, int[2] padding, int[2] stride) -> Tensor // This is somewhat of an inverse to im2col: // col2im(im2col(input)) == divisor * input with divisor as a tensor // im2col and col2im were used to speed up convolutions via GEMM. torch::jit::Value *data = node->input(0); std::vector data_shape = shapeFromTensor(data); ERROR_ON(data_shape.size() != 3 && data_shape.size() != 2); const std::vector output_size = constantToLongVec(node->input(1)->node()); ERROR_ON(output_size.size() != 2); const std::vector kernel_shape = constantToLongVec(node->input(2)->node()); ERROR_ON(kernel_shape.size() != 2); const std::vector dilation = constantToLongVec(node->input(3)->node()); ERROR_ON(dilation.size() != 2); const std::vector padding = constantToLongVec(node->input(4)->node()); ERROR_ON(padding.size() != 2); const std::vector stride = constantToLongVec(node->input(5)->node()); ERROR_ON(stride.size() != 2); // We can be given an unbatched input, with one less dimension -- just give it // a dummy batch dim, to unify later processing. const bool unbatched_input = data_shape.size() == 2; if (unbatched_input) { data = createUnsqueeze(graph, {data}, {0})->output(); data_shape.insert(data_shape.begin(), 1); } // The batch and original channel ordering is unaffected by im2col so we can // reshape to factor them out. 
const auto out_channels = data_shape[1] / (kernel_shape[0] * kernel_shape[1]); const auto num_cols = data_shape[2] * (kernel_shape[0] * kernel_shape[1]); auto *reshaped = createReshape(graph, data, {data_shape[0], out_channels, num_cols}); // Use scatter reduce to add across the relevent positions const auto indices = getScatterReduceIndices( num_cols, output_size[0], output_size[1], kernel_shape[1], kernel_shape[0], dilation[1], dilation[0], padding[1], padding[0], stride[1], stride[0]); auto *indices_const = tensorToConstant(graph, indices); // The indices are shape (1, 1, num_cols) but need to be tiled for the // scatterreduce const auto repeats = at::ones({3}, at::dtype(at::ScalarType::Long) .memory_format(c10::MemoryFormat::Contiguous)); repeats[0] = data_shape[0]; repeats[1] = out_channels; auto *repeats_const = tensorToConstant(graph, repeats); auto *indices_tiled = createTile(graph, {indices_const->output(), repeats_const->output()}); static constexpr bool enable_index_broadcast = true; static constexpr std::int32_t sum_reduce = static_cast(ScatterReduction::Sum); static constexpr std::int32_t axis = 2; auto *scatter_reduced = createScatterreduce(graph, {reshaped->output(), indices_tiled->output()}, output_size[0] * output_size[1], axis, enable_index_broadcast, sum_reduce); auto *res = createReshape( graph, scatter_reduced->output(), {data_shape[0], out_channels, output_size[0], output_size[1]}); // If our input was unbatched, remove the dummy batch dim we added earlier. 
if (unbatched_input) { res = createSqueeze(graph, {res->output()}, {0}); } return res; } torch::jit::Node *transposeHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::transpose(Tensor self, int dim0, int dim1) -> Tensor std::int64_t dim0 = constantToLong(node->input(1)->node()); std::int64_t dim1 = constantToLong(node->input(2)->node()); c10::TensorTypePtr const as_tensor = node->input(0)->type()->cast(); c10::VaryingShape const dims = as_tensor->sizes(); // Convert that IR type into a C++ vector of ints. In popart the // permutation includes all elements (rotate last two elements with [0, 1, // 3, 2]) whereas in pytorch you only need to specify the dimensions being // moved (same operation, [3, 2]). So we need to make sure the IR reflects // that. std::vector permutation; c10::optional size = dims.size(); ERROR_ON_MSG(!size, std::string("Number of dimensions for tensor %") + node->input(0)->debugName() + " is undefined. " + "About to read uninitialized memory," + " unexpected behaviour happened before transpose."); for (std::uint64_t i = 0; i < *size; ++i) { permutation.push_back(i); } // Allow for python array style access. 
if (dim0 < 0) { dim0 = *size + dim0; } if (dim1 < 0) { dim1 = *size + dim1; } permutation[dim0] = dim1; permutation[dim1] = dim0; return createTranspose(graph, {node->input(0)}, permutation); } torch::jit::Node *numpyTHandler(torch::jit::Graph *graph, torch::jit::Node *node) { const auto shape = shapeFromTensor(node->input(0)); if (shape.size() < 2) { return node->input(0)->node(); } std::vector permutation; for (std::int64_t i = shape.size() - 1; i >= 0; i--) { permutation.push_back(i); } return createTranspose(graph, {node->input(0)}, permutation); } torch::jit::Node *splitChunkHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::split(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]" // aten::split(Tensor self, int split_sizes, int dim=0) -> Tensor[]" // aten::chunk(Tensor self, int chunks, int dim) -> Tensor[] // aten::unsafe_chunk(Tensor self, int chunks, int dim) -> Tensor[] torch::jit::Symbol const kind = node->kind(); // Get the shape of the input. c10::TensorTypePtr const as_tensor = node->input(0)->type()->expect(); c10::VaryingShape const dims = as_tensor->sizes(); // Pythonic axis translation. const std::int64_t dim = constantToLong(node->input(2)->node()); const std::int64_t axis = dim >= 0 ? dim : *dims.size() + dim; // Size of each split ignoring the remainder at the end. std::vector size_of_each_split; // Split size can either be the number of splits or the size of the // splits. std::optional split_size; if (node->input(1)->node()->kind() == symbols::poptorch::tensor_constant) { ERROR_ON(getNodeScalarType(node->input(1)) != at::ScalarType::Int); split_size = constantToLong(node->input(1)->node()); } if (kind == c10::aten::chunk || kind == c10::aten::unsafe_chunk) { // Chunk takes in the *number of chunks*. Canonicalise it to *size of // chunks*. 
const auto chunk_dim = *dims[axis]; ERROR_ON_MSG(!split_size.has_value(), "aten::chunk/aten::unsfe_chunk expect to receive " "a single split_size"); const auto n_chunks = *split_size; // Integer division: (dim / n_chunks) with rounding up std::int64_t const slice_size = (chunk_dim + n_chunks - 1) / n_chunks; auto remaining_size = chunk_dim; while (remaining_size >= slice_size) { size_of_each_split.push_back(slice_size); remaining_size -= slice_size; } // If we can't divide into equal chunks, then divide such that all but // the last chunk are the same size, and the last chunk is smaller. // If such a division is not possible, then return one fewer // chunks than specified if (remaining_size > 0) { // Add an extra slice for the remainder. size_of_each_split.push_back(remaining_size); } } else if (split_size) { // Split takes in the size of each chunk. std::int64_t const slice_size = *split_size; for (int i = 0; i < *dims[axis] / slice_size; ++i) { size_of_each_split.push_back(slice_size); } // Add an extra slice for the remainder. if (*dims[axis] % *split_size != 0) { size_of_each_split.push_back(*dims[axis] % *split_size); } } else { size_of_each_split = constantToLongVec(node->input(1)->node()); } // Rolling index to track where we are in the tensor. std::int64_t index = 0; // The result of each slice. std::vector slices; // Slice up according to the canonicalised split vector. for (std::int64_t const slice_size : size_of_each_split) { torch::jit::Node *slice = createSlice( graph, {node->input(0)}, {index + slice_size}, {index}, {axis}); // Add the slice to the graph. slices.push_back(slice->output()); // Move along in the vector dimension. 
index += slice_size; } auto *list_node = createAndInsertNode(graph, at::prim::ListConstruct, slices); ERROR_ON(node->output()->uses().size() != 1); auto *unpack = node->output()->uses()[0].user; ERROR_ON(unpack->kind() != c10::prim::ListUnpack); ERROR_ON(slices.size() != unpack->outputs().size()); std::vector v; for (std::uint64_t i = 0; i < *dims.size(); ++i) { v.push_back(*dims[i]); } // Propagate types for (size_t i = 0; i < slices.size(); i++) { v[axis] = size_of_each_split[i]; const auto type = slices[i]->type()->expect()->withSizes(v); unpack->output(i)->setType(type); } return list_node; } torch::jit::Node *toHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto tensor_type = node->input(0)->type()->cast(); ERROR_ON_MSG(!tensor_type, "Casting from a non-tensor type not supported, in an aten::to."); // aten::to(Tensor(a) self, Device? device, int? dtype=None, bool // non_blocking=False, bool copy=False) -> Tensor(a|b)" aten::to(Tensor(a) // self, int? dtype=None, bool non_blocking=False, bool copy=False) -> // Tensor(a|b)" aten::to(Tensor(a) self, [args without dtype]) std::optional cast_to; if (node->input(1)->type()->cast() || node->input(1)->type()->cast()) { cast_to = getNodeScalarType(node->output(0)); } if (cast_to.has_value()) { // Avoid promoting to an unsupported type cast_to = coerceToSupportedType(*cast_to); } if (!cast_to.has_value() || cast_to == *tensor_type->scalarType()) { if (cast_to.has_value() && cast_to == *tensor_type->scalarType()) { logging::trace("Ignoring type cast to same type, {}, {}", cast_to.value(), *tensor_type->scalarType()); } node->output()->replaceAllUsesWith(node->input(0)); markNodeForDeletion(node); return nullptr; } return createCast(graph, node->input(0), *cast_to); } torch::jit::Node *upsampleHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::upsample_nearest1d(Tensor self, int[] output_size, float? scales) -> // Tensor // // aten::upsample_nearest2d(Tensor self, int[] output_size, float? 
// scales_h, float? scales_w) -> Tensor // // aten::upsample_nearest3d(Tensor self, int[] output_size, float? scales_d, // float? scales_h, float? scales_w) -> Tensor // upsample_nearest1d.vec(Tensor input, int[]? output_size, // float[]? scale_factors) // // upsample_nearest2d.vec(Tensor input, int[]? output_size, // float[]? scale_factors) -> Tensor // // upsample_nearest3d.vec(Tensor input, int[]? output_size, // float[]? scale_factors) -> Tensor // upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, // float? scales_h=None, float? scales_w=None) -> Tensor // upsample_bicubic2d.vec(Tensor input, int[]? output_size, // bool align_corners, float[]? scale_factors) -> Tensor // // Not supported by Popart yet: // // aten::upsample_linear1d(Tensor self, int[] output_size, bool align_corners, // float? scales) -> Tensor // // aten::upsample_trilinear3d(Tensor self, int[] output_size, bool // align_corners, float? scales_d, float? scales_h, float? scales_w) -> Tensor const auto num_inputs = node->inputs().size(); torch::jit::Value *input = node->input(0); torch::jit::Value *output_size = node->input(1); const bool is_bicubic = node->kind() == c10::aten::upsample_bicubic2d; size_t scales_idx = 2; if (is_bicubic) { const auto align_corners = constantToBool(node->input(2)->node()); ERROR_ON_MSG(align_corners, "Only support align_corners=False."); scales_idx++; } const auto output_rank = shapeFromTensor(node->output()).size(); const auto input_shape = shapeFromTensor(input); const auto input_rank = input_shape.size(); ERROR_ON_MSG(output_rank != input_rank, "Input / output rank mismatch: " << input_rank << " != " << output_rank); // Omit the leading batch and channel dims for computing the scale std::vector scales{1.0, 1.0}; if (num_inputs > scales_idx) { torch::jit::Value *scale1 = node->input(scales_idx); // Handling individual constants? 
if (isTensorConstant(scale1->node())) { for (size_t i = 0; i < input_rank - 2; i++) { scales.push_back(constantToFloat(node->input(scales_idx + i)->node())); } } else { // Otherwise it's upsample_bicubic2d.vec, just copy the vector of scales const auto scale_list = handleTensorList(scale1->node()); if (!scale_list.empty()) { for (auto *s : scale_list) { scales.push_back(constantToFloat(s->node())); } } } } if (scales.size() == 2) { const auto output_shape = handleTensorList(output_size->node()); for (size_t dim = 2; dim < input_rank; ++dim) { scales.push_back(constantToFloat(output_shape[dim - 2]->node()) / input_shape[dim]); } } torch::jit::Node *scales_node = createConstantFloatLike( graph, input, scales, {static_cast(scales.size())}); torch::jit::Node *roi_node = createConstantFloat32(graph, std::vector(input_rank, 0.0f), {static_cast(input_rank)}); const std::string resize_type = is_bicubic ? "cubic" : "nearest"; static constexpr const char *coordinate_transformation_mode = "half_pixel"; static constexpr float cubic_coeff_a = -0.75f; static constexpr int64_t exclude_outside = 0; static constexpr float extrapolation_value = 0.0f; static constexpr const char *nearest_mode = "pytorch"; return createResize(graph, {input, roi_node->output(), scales_node->output()}, coordinate_transformation_mode, cubic_coeff_a, exclude_outside, extrapolation_value, resize_type, nearest_mode); } torch::jit::Node *upsampleBilinear2dHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *input = node->input(0); auto *output_size = node->input(1); auto *output_scale = node->input(3); const bool align_corners = constantToBool(node->input(2)->node()); const auto scalar_type = getNodeScalarType(input); const auto output_rank = shapeFromTensor(node->output()).size(); const auto input_shape = shapeFromTensor(input); const auto input_rank = input_shape.size(); ERROR_ON_MSG(output_rank != input_rank, "Input / output rank mismatch: " << input_rank << " != " << output_rank); // Omit 
the leading batch and channel dims for computing the scale std::vector scales{1.0, 1.0}; if (!isNone(output_size)) { const auto output_shape = constantToLongVec(output_size->node()); for (size_t dim = 2; dim < input_rank; ++dim) { scales.push_back(static_cast(output_shape[dim - 2]) / input_shape[dim]); } } else { const auto scalesxy = constantToFloatVec(output_scale->node()); ERROR_ON_MSG(scalesxy[0] != scalesxy[1], "Non-uniform bilinear upsampling not supported"); ERROR_ON_MSG(scalesxy[0] != floor(scalesxy[0]), "Bilinear upsampling with non-integer factor not supported"); scales.push_back(scalesxy[0]); scales.push_back(scalesxy[1]); } const std::vector inputs = {input}; std::string const name = "UpsampleBilinear2d"; std::string const domain = "poptorch.custom_ops"; std::string const attributes( "{\"scaling_factor\":" + std::to_string(scales[2]) + ", " + "\"align_corners\":" + std::to_string(static_cast(align_corners)) + "}"); auto *new_node = createCustomOperation(graph, inputs, name, domain, 1, 1, attributes); new_node->output(0)->setType(c10::TensorType::create( scalar_type, c10::nullopt, c10::nullopt, c10::nullopt)); return new_node; } torch::jit::Node *unsupportedUpsampleHandler(torch::jit::Graph *graph, torch::jit::Node *node) { UNUSED(graph); ERROR("Unsupported upsample mode " << node->kind().toQualString() << ": currently only 'nearest' is supported"); return nullptr; } torch::jit::Node *stackHandler(torch::jit::Graph *graph, torch::jit::Node *node) { const std::int64_t dim = constantToLong(node->input(1)->node()); const std::vector values = handleTensorList(node->input(0)->node()); std::vector transformed_tensors; transformed_tensors.reserve(values.size()); for (auto *value : values) { transformed_tensors.push_back( createUnsqueeze(graph, {value}, {dim})->output()); } return createConcat(graph, transformed_tensors, dim); } torch::jit::Node *intHandler(torch::jit::Graph *graph, torch::jit::Node *node) { return createCast(graph, node->input(0), 
at::ScalarType::Int); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::broadcast_to, expandHandler); registerHandler(c10::aten::expand, expandHandler); registerHandler(c10::aten::expand_as, expandAsHandler); registerHandler(c10::aten::view, reshapeHandler); registerHandler(c10::aten::_unsafe_view, reshapeHandler); registerHandler(c10::aten::unsqueeze, reshapeHandler); registerHandler(c10::aten::flatten, flattenHandler); registerHandler(c10::aten::reshape, reshapeHandler); registerHandler(c10::aten::_reshape_alias, reshapeHandler); registerHandler(c10::aten::select, selectHandler); registerHandler(c10::aten::split, splitChunkHandler); registerHandler(c10::aten::split_with_sizes, splitChunkHandler); registerHandler(c10::aten::chunk, splitChunkHandler); registerHandler(c10::aten::unsafe_chunk, splitChunkHandler); registerHandler(c10::aten::contiguous, contiguousHandler); registerHandler(c10::aten::permute, permuteHandler); registerHandler(c10::aten::transpose, transposeHandler); registerHandler(c10::aten::transpose_, transposeHandler); registerHandler(c10::aten::col2im, col2imHandler); registerHandler(c10::aten::im2col, im2colHandler); registerHandler(c10::aten::numpy_T, numpyTHandler); registerHandler(c10::aten::to, toHandler); registerHandler(c10::aten::type_as, toHandler); registerHandler(c10::aten::upsample_nearest1d, upsampleHandler); registerHandler(c10::aten::upsample_nearest2d, upsampleHandler); registerHandler(c10::aten::upsample_nearest3d, upsampleHandler); registerHandler(c10::aten::upsample_linear1d, unsupportedUpsampleHandler); registerHandler(c10::aten::upsample_bilinear2d, upsampleBilinear2dHandler); registerHandler(c10::aten::upsample_trilinear3d, unsupportedUpsampleHandler); registerHandler(c10::aten::upsample_bicubic2d, upsampleHandler); registerHandler(c10::aten::squeeze, reshapeHandler); registerHandler(c10::aten::as_strided, asStridedHandler); 
registerHandler(c10::aten::stack, stackHandler); registerHandler(c10::aten::Int, intHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/ScatterReduction.cpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved. #include "ScatterReduction.hpp" #include "PopartCanonicalizationUtils.hpp" namespace poptorch { std::int32_t getReductionMethod(torch::jit::Node *node) { const auto reduce = constantToString(node); if (reduce == "sum" || reduce == "add") { return static_cast(ScatterReduction::Sum); } if (reduce == "amax") { return static_cast(ScatterReduction::Max); } if (reduce == "amin") { return static_cast(ScatterReduction::Min); } if (reduce == "mean") { return static_cast(ScatterReduction::Mean); } if (reduce == "prod" || reduce == "multiply") { return static_cast(ScatterReduction::Mul); } ERROR("Unsupported reduction type for scatter_reduce: " << reduce); } float getReductionInitValue(std::int32_t reduce) { float init_val; switch (reduce) { case static_cast(ScatterReduction::Sum): case static_cast(ScatterReduction::Mean): init_val = 0.0; break; case static_cast(ScatterReduction::Mul): init_val = 1.0; break; case static_cast(ScatterReduction::Max): init_val = -std::numeric_limits::infinity(); break; case static_cast(ScatterReduction::Min): init_val = std::numeric_limits::infinity(); break; default: ERROR("Unsupported reduction type for scatter_reduce: " << reduce); break; } return init_val; } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/ScatterReduction.hpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
#ifndef SCATTER_REDUCTION_H #define SCATTER_REDUCTION_H #include namespace torch { namespace jit { class Node; } // namespace jit } // namespace torch namespace poptorch { enum class ScatterReduction { Sum = 0, Max, Min, Mul, None, Mean }; std::int32_t getReductionMethod(torch::jit::Node *node); float getReductionInitValue(std::int32_t reduce); } // namespace poptorch #endif ================================================ FILE: poptorch/source/popart_canonicalization/SliceOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include #include "poptorch_logging/Logging.hpp" #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "PopartCanonicalizationUtils.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" namespace poptorch { namespace { const char *fail_msg = "The size of the sliced tensor must be a constant for " "each execution of the model when running on the IPU."; // Extract the constant used in the supplied add/subtract node and increase or // decrease size accordingly. Negate reverses the sign. void extractAddSubtractConstant(torch::jit::Node *node, std::int64_t *size, bool negate) { ERROR_ON_MSG(node->kind() != symbols::popart::add && node->kind() != symbols::popart::sub, fail_msg); ERROR_ON(node->inputs().size() != 2); auto *constant = isAnyConstant(node->input(0)->node()) ? node->input(0)->node() : node->input(1)->node(); if (node->kind() == symbols::popart::sub) { negate = !negate; } if (isFloatingPointConstant(constant)) { ERROR(fail_msg << " In this case, there is a float added to the slice " << "indices meaning it may change between runs."); } if (negate) { (*size) -= constantToLong(constant); } else { (*size) += constantToLong(constant); } } // Returns the input of a node which is not a constant, if any. Otherwise, // returns null. Raises an error if there are more than one such input. 
torch::jit::Node *getOnlyNonConstantInput(torch::jit::Node *node) { torch::jit::Node *only_such_input = nullptr; for (auto *input : node->inputs()) { if (!isAnyConstant(input->node())) { if (only_such_input != nullptr) { logging::trace("dynamicSliceHandler failed due to a node with multiple " "non constant inputs when seeking a shared ancestor " "node. Offending node: {}", *node); ERROR(fail_msg); } only_such_input = input->node(); } } return only_such_input; } // Returns true if the nodes always yield the same output. bool nodesAlwaysSameOutput(torch::jit::Node *a, torch::jit::Node *b) { // Check same kind if (a->kind() != b->kind()) { return false; } // Avoid random nodes if (isNondeterministic(*a) || isNondeterministic(*b)) { return false; } // Check same inputs if (a->inputs().size() != b->inputs().size()) { return false; } const auto *a_it = a->inputs().begin(); const auto *b_it = b->inputs().begin(); for (; a_it != a->inputs().end(); a_it++, b_it++) { if (!nodesAlwaysSameOutput((*a_it)->node(), (*b_it)->node())) { return false; } } // Check same attributes if (a->numAttributes() != b->numAttributes()) { return false; } auto a_attributes_names = a->attributeNames(); for (auto attrib_name : a_attributes_names) { if (!attributeEqual(a, b, attrib_name)) { return false; } } return true; } // Convert any inputs to the specified node which are a cast of another // constant into a single (already cast) constant void resolveCastConstants(torch::jit::Graph *graph, torch::jit::Node *node) { for (auto *input : node->inputs()) { // Move on if it is not a cast situation auto *cast_node = input->node(); if (cast_node->kind() != symbols::popart::cast) { continue; } auto *constant_to_be_cast = cast_node->input()->node(); if (constant_to_be_cast->kind() != symbols::poptorch::tensor_constant) { continue; } // Obtain the tensor and cast auto tensor = getNodeTensorAttrValue(constant_to_be_cast); auto popart_cast_to = cast_node->s(c10::Symbol::attr("to")); auto scalar_type = 
onnxStrToScalarType(popart_cast_to.c_str()); tensor.to(scalar_type); // Replace node to avoid a cast torch::jit::WithInsertPoint insert_point(node); auto *replacement_node = tensorToConstant(graph, tensor); cast_node->output()->replaceAllUsesWith(replacement_node->output()); markNodeForDeletion(cast_node); markNodeForDeletion(constant_to_be_cast); } } // Follow the inputs of each node until we reach a common ancestor. // Every node in the chain must only have one non-constant input for this to // work. If this were not the case, the setup is unlikely to resolve to a case // in which dynamic slice could work (exceptions include adding an input // multiplied by zero, etc). Therefore, this limitation is not an issue in // practice. void populateAncestory(torch::jit::Graph *graph, std::vector *start_ancestory, std::vector *end_ancestory, torch::jit::Node *start_node, torch::jit::Node *end_node) { torch::jit::Node *start_ancestor = start_node; torch::jit::Node *end_ancestor = end_node; while (start_ancestor != end_ancestor) { // Push back whichever node is later bool end_is_later = end_ancestor->isAfter(start_ancestor); auto **later_node = end_is_later ? &end_ancestor : &start_ancestor; auto *add_to_list = end_is_later ? end_ancestory : start_ancestory; add_to_list->push_back(*later_node); // The algorithm will fail if there is an input that would be a constant but // for a cast. The best solution is to cast the constant to elimate the // cast. 
resolveCastConstants(graph, *later_node); // Update either start_ancestor or end_ancestor by going a step along the // chain of non-constant inputs *later_node = getOnlyNonConstantInput(*later_node); if (*later_node == nullptr) { logging::trace("dynamicSliceHandler failed due to lack of a shared " "ancestor."); ERROR(fail_msg); } } // Do a sanity check and log the results to a trace ERROR_ON(start_ancestor == nullptr); logging::trace("Shared ancestor: {}\n", *start_ancestor); logging::trace("Start ancestory:"); for (auto it = start_ancestory->rbegin(); it != start_ancestory->rend(); it++) { logging::trace("{}", **it); } logging::trace("End ancestory:"); for (auto it = end_ancestory->rbegin(); it != end_ancestory->rend(); it++) { logging::trace("{}", **it); } } // Remove nodes which are common across the start of both node ancestries void removeCommonNodes(std::vector *start_ancestory, std::vector *end_ancestory) { while (!(start_ancestory->empty() || end_ancestory->empty())) { if (nodesAlwaysSameOutput(start_ancestory->back(), end_ancestory->back())) { start_ancestory->pop_back(); end_ancestory->pop_back(); } else { break; } } if (start_ancestory->empty() && end_ancestory->empty()) { ERROR("The start and end of a slice must be different."); } } // Obtain the size of the slice based on the processed start/end ancestory. // This involves processing add and subtract nodes and their constants. 
std::int64_t determineSizeConstant(const std::vector &start_ancestory, const std::vector &end_ancestory) { std::int64_t size = 0; for (auto *node : start_ancestory) { if (node->kind() == c10::aten::Int || node->kind() == symbols::popart::cast) { continue; } extractAddSubtractConstant(node, &size, true); } for (auto *node : end_ancestory) { if (node->kind() == c10::aten::Int || node->kind() == symbols::popart::cast) { continue; } extractAddSubtractConstant(node, &size, false); } logging::trace("Size determined to be: {}", size); return size; } std::int64_t inferDynamicSliceSize(torch::jit::Graph *graph, torch::jit::Node *start_node, torch::jit::Node *end_node) { std::vector start_ancestory; std::vector end_ancestory; // Obtain the path from the nodes back to a common node populateAncestory(graph, &start_ancestory, &end_ancestory, start_node, end_node); // Remove any common nodes at the beginning of each ancestory // NB this is used in finding the size of the slice only and does not affect // the start node. removeCommonNodes(&start_ancestory, &end_ancestory); // Calculate the size of the slice std::int64_t size = determineSizeConstant(start_ancestory, end_ancestory); // The == 0 case should be taken care of already but having it here stops // lint errors for dividing by 0. if (size <= 0) { ERROR("Taking a slice of a tensor with the end less than the start is " "not supported."); } return size; } // Handle a slice in which the start is an arbitary (i.e. 
non constant) input // but the slice is a fixed size torch::jit::Node *dynamicSliceHandler(torch::jit::Graph *graph, torch::jit::Node *node, torch::jit::Node *start_node, std::size_t start_offset, std::int64_t size) { // The dim is as usual std::int64_t dim = constantToLong(node->input(1)->node()); auto length_of_dim = shapeFromTensor(node->input(0))[dim]; ERROR_ON_MSG(length_of_dim % size != 0, "The size of the slice (" << size << ") must be a factor of the slicing " << "dimension (" << length_of_dim << ")."); // Make sure the start_node is a tensor not an int if (start_node->output(start_offset)->type()->kind() == c10::TypeKind::IntType) { start_node = start_node->input()->node(); start_offset = 0; } // Reshape the start node from a scalar to a one-dim and cast to UINT32 start_node = createReshape(graph, start_node->output(start_offset), {1}); start_node = createCast(graph, {start_node->output()}, "UINT32"); auto *new_node = createDynamicslice( graph, {node->input(0), start_node->output()}, {dim}, {size}, 1); // No overlap 1 assumed return new_node; } // Handle an update slice in which the start is an arbitary (i.e. 
non constant) // input but the slice is a fixed size torch::jit::Node * dynamicUpdateHandler(torch::jit::Graph *graph, torch::jit::Node *node, torch::jit::Node *src_node, torch::jit::Node *start_node, std::size_t src_offset, std::size_t start_offset, std::int64_t size) { // The dim is as usual std::int64_t dim = constantToLong(node->input(2)->node()); // Make sure the start_node is a tensor not an int if (start_node->output(start_offset)->type()->kind() == c10::TypeKind::IntType) { start_node = start_node->input()->node(); start_offset = 0; } // Reshape the start node from a scalar to a one-dim and cast to UINT32 start_node = createReshape(graph, start_node->output(start_offset), {1}); start_node = createCast(graph, {start_node->output()}, "UINT32"); auto *new_node = createDynamicupdate( graph, {node->input(0), start_node->output(), src_node->output(src_offset)}, {dim}, {size}, 1); // No overlap 1 assumed return new_node; } // implements slicing with step by subsampling a slice with unit step torch::jit::Node *subsampleSlice(torch::jit::Graph *graph, torch::jit::Node *slice, int dims, int dim, int step) { if (step != 1) { std::vector strides(dims, static_cast(1)); strides[dim] = step; slice = createSubsample(graph, {slice->output()}, strides); } return slice; } namespace { torch::jit::Node *sliceCommon(torch::jit::Graph *graph, torch::jit::Node *node, torch::jit::Value *input, int64_t dim, torch::jit::Node *start_node, std::size_t start_offset, torch::jit::Node *end_node, int64_t step) { auto dims = shapeFromTensor(input); if (dim < 0) { dim += dims.size(); } // If any of the inputs are not constants, dynamicSlice is required if (!isTensorConstant(start_node) || !isTensorConstant(end_node)) { auto size = inferDynamicSliceSize(graph, start_node, end_node); auto *slice = dynamicSliceHandler(graph, node, start_node, start_offset, size); return subsampleSlice(graph, slice, dims.size(), dim, step); } std::int64_t start = constantToLong(start_node); std::int64_t end = 
constantToLong(end_node); // If we slice a scalar we should do nothing. if (dims.empty()) { return createIdentity(graph, {input}); } // Based on aten/src/ATen/native/TensorShape.cpp slice() if (start < 0) { start += dims[dim]; } if (end < 0) { end += dims[dim]; } if (start < 0) { start = 0; } else if (start >= dims[dim]) { start = dims[dim]; } if (end < start) { end = start; } else if (end >= dims[dim]) { end = dims[dim]; } auto *slice = createSlice(graph, {input}, {end}, {start}, {dim}); return subsampleSlice(graph, slice, dims.size(), dim, step); } } // namespace torch::jit::Node *sliceHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::slice(Tensor self, int dim, int start, int end, int step) -> Tensor auto *input = node->input(0); auto dim = constantToLong(node->input(1)->node()); auto *start_node = node->input(2)->node(); auto start_offset = node->input(2)->offset(); auto *end_node = node->input(3)->node(); auto *step_node = node->input(4)->node(); ERROR_ON_MSG(!isTensorConstant(step_node), "Slicing step must be a constant"); auto step = constantToLong(step_node); ERROR_ON_MSG(step < 1, "Slicing step must be at least 1"); return sliceCommon(graph, node, input, dim, start_node, start_offset, end_node, step); } torch::jit::Node *ptDynamicSliceHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // poptorch::dynamic_slice(Tensor self, int dim, Tensor start, int size) // -> Tensor auto *input = node->input(0); auto dim = constantToLong(node->input(1)->node()); auto *start_node = node->input(2)->node(); auto start_offset = node->input(2)->offset(); auto *size_node = node->input(3)->node(); auto *step_node = node->input(4)->node(); ERROR_ON_MSG(!isTensorConstant(size_node), "Slicing size must be a constant"); auto size = constantToLong(size_node); ERROR_ON_MSG(size == 0, "The start and end of a slice must be different."); ERROR_ON_MSG(size < 0, "Taking a slice of a tensor with the end less than " "the start is not supported."); 
ERROR_ON_MSG(!isTensorConstant(step_node), "Slicing step must be a constant"); auto step = constantToLong(step_node); ERROR_ON_MSG(step < 1, "Slicing step must be at least 1"); auto dims = shapeFromTensor(input); auto *slice = dynamicSliceHandler(graph, node, start_node, start_offset, size); return subsampleSlice(graph, slice, dims.size(), dim, step); } torch::jit::Node *ptDynamicUpdateHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // poptorch::dynamic_update(Tensor self, Tensor src, int dim, Tensor start, // int size, int step) -> Tensor auto *src_node = node->input(1)->node(); auto src_offset = node->input(1)->offset(); auto *start_node = node->input(3)->node(); auto start_offset = node->input(3)->offset(); auto *size_node = node->input(4)->node(); ERROR_ON_MSG(!isTensorConstant(size_node), "Slicing size must be a constant"); auto size = constantToLong(size_node); ERROR_ON_MSG(size == 0, "The start and end of a slice must be different."); ERROR_ON_MSG(size < 0, "Taking a slice of a tensor with the end less than " "the start is not supported."); auto *out = dynamicUpdateHandler(graph, node, src_node, start_node, src_offset, start_offset, size); return out; } torch::jit::Node *unbindHandler(torch::jit::Graph *graph, torch::jit::Node *node) { // aten::unbind(Tensor self, int dim) -> Tensor[] auto *x = node->input(0); auto shape = shapeFromTensor(x); int dim = constantToInt(node->input(1)->node()); std::int64_t dim_size = shape[dim]; std::vector tensors; // Select each index in dimension 'dim' of x and add all // slices to a vector for (std::int64_t i = 0; i < dim_size; i++) { auto *inds = wrapInConstant1D(graph, i); auto *gather = createGather(graph, {x, inds}, dim); // Squeeze out the gathered dim auto *squeeze = createSqueeze(graph, {gather->output()}, {dim}); tensors.push_back(squeeze->output()); } return createAndInsertNode(graph, at::prim::ListConstruct, tensors); } torch::jit::Node *narrowHandler(torch::jit::Graph *graph, torch::jit::Node *node) { 
// narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) auto *input = node->input(0); int dim = constantToInt(node->input(1)->node()); auto *start_node = node->input(2)->node(); auto start_offset = node->input(2)->offset(); auto *end_node = node->input(3)->node(); return sliceCommon(graph, node, input, dim, start_node, start_offset, end_node, 1); } torch::jit::Node *unfoldHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *input = node->input(0); const auto input_type = input->type()->expect(); const auto dimension = handleDimensionParam(node->input(1), input_type); const auto size = constantToInt(node->input(2)->node()); const auto step = constantToInt(node->input(3)->node()); return createUnfold(graph, input, dimension, size, step); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::slice, sliceHandler); registerHandler(symbols::poptorch::dynamic_slice, ptDynamicSliceHandler); registerHandler(symbols::poptorch::dynamic_update, ptDynamicUpdateHandler); registerHandler(c10::aten::unbind, unbindHandler); registerHandler(c10::aten::narrow, narrowHandler); registerHandler(c10::aten::unfold, unfoldHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/SoftmaxOps.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

namespace poptorch {
namespace {

// Shared lowering for softmax and log_softmax.
//
// softmax_fn is the builder to call (createSoftmax / createLogsoftmax); it is
// invoked as softmax_fn(graph, {value}, axis). When the reduction axis is not
// the last one the input is transposed so that it becomes the last axis, and
// the result is transposed back.
template <typename SoftmaxFunc>
torch::jit::Node *handleSoftmaxOp(torch::jit::Graph *graph,
                                  torch::jit::Node *node,
                                  SoftmaxFunc &&softmax_fn) {
  std::vector<std::int64_t> input_shape = shapeFromTensor(node->input(0));
  int64_t rank = static_cast<int64_t>(input_shape.size());

  std::int64_t dim = constantToLong(node->input(1)->node());
  if (dim < 0) {
    dim = rank + dim;
  }

  if (rank < 2 || dim == rank - 1) {
    return softmax_fn(graph, {node->input(0)}, dim);
  }

  // ONNX (log)softmax up to version 13 specifies that the input is
  // coerced to 2D where the axis attribute demarcates the flattening dim.
  // To workaround this we:
  //
  // 1. permute the dim arg to the final dimension
  // 2. evaluate (log)softmax using last dim as the axis
  // 3. permute result back to the original dimension order.
  //
  // Opset 13 brings the ONNX spec in line with the interpretation of the dim
  // argument as implemented by torch so this may need updating when popart
  // adds support for opset 13.
  std::vector<std::int64_t> perm(rank);
  std::iota(perm.begin(), perm.end(), 0);
  // Swapping two entries gives a permutation that is its own inverse, so the
  // same `perm` is used for both transposes.
  std::swap(perm[dim], perm.back());

  torch::jit::Node *transpose = createTranspose(graph, {node->input(0)}, perm);
  torch::jit::Node *sm = softmax_fn(graph, {transpose->output()}, rank - 1);
  return createTranspose(graph, {sm->output()}, perm);
}

torch::jit::Node *softmaxHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  // "aten::softmax(Tensor self, int dim, int? dtype) -> Tensor"
  return handleSoftmaxOp(graph, node, createSoftmax);
}

torch::jit::Node *logSoftmaxHandler(torch::jit::Graph *graph,
                                    torch::jit::Node *node) {
  // "aten::log_softmax(Tensor self, int dim, int? dtype) -> Tensor"
  return handleSoftmaxOp(graph, node, createLogsoftmax);
}

} // namespace

// Registers the softmax handlers during static initialisation.
__attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() {
  registerHandler(c10::aten::softmax, softmaxHandler);
  registerHandler(c10::aten::_softmax, softmaxHandler);
  registerHandler(c10::aten::log_softmax, logSoftmaxHandler);
  registerHandler(c10::aten::_log_softmax, logSoftmaxHandler);
}

} // namespace poptorch

================================================
FILE: poptorch/source/popart_canonicalization/TensorOps.cpp
================================================
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#include "../PoptorchStaticInit.hpp"
#include "PopartCanonicalizationUtils.hpp"
#include "ScatterReduction.hpp"
#include "poptorch/DispatchTracer.hpp"
#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/Utils.hpp"
#include "poptorch_logging/Error.hpp"
#include "poptorch_logging/Logging.hpp"

#include "../PoptorchSymbols.hpp"

// NOTE(review): the header name of this include was missing in this copy of
// the file. <numeric> is what the code below needs (std::accumulate,
// std::iota) - confirm against upstream.
#include <numeric>

namespace poptorch {
namespace {

// Replaces aten::size by a constant holding the requested dimension of the
// (statically known) input shape.
torch::jit::Node *sizeHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  //  aten::size(Tensor input, int dim) -> int
  std::vector<std::int64_t> shape = shapeFromTensor(node->input(0));
  std::int64_t dim = constantToLong(node->input(1)->node());
  // A negative dim counts from the back; wrap it before indexing the shape.
  if (dim < 0) {
    dim += static_cast<std::int64_t>(shape.size());
  }
  return createConstantInt(graph, {shape[dim]}, {1});
}

// The input is required to already be a tensor constant, so the node is
// bypassed: its uses are redirected to its input and it is marked for
// deletion. Returns nullptr since no new node is created.
torch::jit::Node *numToTensorHandler(torch::jit::Graph *graph,
                                     torch::jit::Node *node) {
  // Should be a tensor already
  ERROR_ON(node->input(0)->node()->kind() !=
           symbols::poptorch::tensor_constant);
  UNUSED(graph);
  node->output()->replaceAllUsesWith(node->input(0));
  markNodeForDeletion(node);
  return nullptr;
}

torch::jit::Node *flipHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::flip(Tensor self, int[] dims) -> Tensor
  auto *input = node->input(0);

  // Use output shape because input shape might not exist
  // if the input is the result of another operation
  auto input_shape = shapeFromTensor(node->output());
  auto dims = constantToLongVec(node->input(1)->node());

  // Wrap negative dims into [0, rank).
  for (auto &dim : dims) {
    if (dim < 0) {
      dim += input_shape.size();
    }
  }

  return createReverse(graph, {input}, dims);
}

// Input tensor of shape [M, N, ...] is repeated in [R1, R2, ...]
// dimensions by:
//   1) transforming to [1, M, 1, N, ...]
//   2) expanding to [R1, M, R2, N, ...]
//   3) reshaping to [R1*M, R2*N, ...]
torch::jit::Node *repeatHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  torch::jit::Value *input = node->input(0);
  std::vector<std::int64_t> dim_repeats =
      constantToLongVec(node->input(1)->node());
  std::vector<std::int64_t> old_shape = shapeFromTensor(input);
  const std::vector<std::int64_t> new_shape = shapeFromTensor(node->output());

  // If repeat dimensions exceed shape dimensions, pad the front of the
  // original shape with singleton dimensions so that it can
  // be expanded
  std::size_t const padding = dim_repeats.size() > old_shape.size()
                                  ? dim_repeats.size() - old_shape.size()
                                  : 0;

  std::vector<std::int64_t> dim_expands;
  std::vector<std::int64_t> transform_shape;
  for (std::size_t i = 0; i < dim_repeats.size(); i++) {
    dim_expands.push_back(dim_repeats[i]);

    std::int64_t const padded_dim = i < padding ? 1 : old_shape[i - padding];
    // An extra singleton axis is only needed when both the repeat count and
    // the existing extent are larger than one.
    if (padded_dim > 1 && dim_repeats[i] > 1) {
      transform_shape.push_back(1);
      dim_expands.push_back(padded_dim);
    }
    transform_shape.push_back(padded_dim);
  }

  auto *reshape = createReshape(graph, input, transform_shape);
  auto *expand = createExpand(
      graph, {reshape->output(), intVectorToIrConstant(graph, dim_expands)});

  return createReshape(graph, expand->output(), new_shape);
}

// Lowers aten::roll as, for each rolled dimension, a split into two chunks
// followed by a concat of the chunks in swapped order.
torch::jit::Node *rollHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
  auto *input = node->input(0);
  auto input_shape = shapeFromTensor(input);
  auto shifts = constantToLongVec(node->input(1)->node());
  auto dims = constantToLongVec(node->input(2)->node());
  bool reshape_output = false;

  if (dims.empty()) {
    // If dims not provided, a flattened version of the tensor is rolled and
    // then reshaped back.
    ERROR_ON_MSG(shifts.size() != 1,
                 "The 'shifts' argument of the roll op must be a scalar when "
                 "'dims' is not specified.");
    input = createFlatten(graph, {input}, 0)->output();
    // The initial value fixes the accumulator type: a plain `1` would
    // accumulate in int and truncate large element counts.
    const int64_t flattened_size =
        std::accumulate(input_shape.begin(), input_shape.end(),
                        std::int64_t{1}, std::multiplies<std::int64_t>());
    input_shape.clear();
    input_shape.push_back(1);
    input_shape.push_back(flattened_size);
    dims.push_back(1);
    reshape_output = true;
  } else {
    ERROR_ON_MSG(shifts.size() != dims.size(),
                 "The 'shifts' and 'dims' arguments of the roll op must be the "
                 "same size.");
  }

  torch::jit::Value *output = input;
  auto number_of_dims = input_shape.size();
  for (size_t i = 0; i < dims.size(); ++i) {
    auto current_dim = dims.at(i);
    // Match the torch API of requiring dim in [-len(shape), len(shape)-1]
    ERROR_ON_MSG(
        ((static_cast<std::size_t>(current_dim) >= number_of_dims) &&
         (current_dim >= 0)) ||
            ((static_cast<std::size_t>(-current_dim) > number_of_dims) &&
             (current_dim < 0)),
        "Dimension out of range at index "
            << i << " (expected to be in range of ["
            << -static_cast<std::int64_t>(number_of_dims) << ", "
            << number_of_dims - 1 << "], but got " << current_dim
            << ") in the roll op.");
    current_dim = (current_dim + number_of_dims) % number_of_dims;
    auto current_dim_size = input_shape.at(current_dim);

    // Handle overreaching and negative shifts.
    auto split = (((-shifts.at(i)) % current_dim_size) + current_dim_size) %
                 current_dim_size;

    auto *chunks = createSplit(graph, {output}, 2, current_dim,
                               {split, current_dim_size - split});
    output = createConcat(graph, {chunks->output(1), chunks->output(0)},
                          current_dim)
                 ->output();
  }

  if (reshape_output) {
    return createReshape(graph, output, shapeFromTensor(node->input(0)));
  }
  return output->node();
}

torch::jit::Node *cloneHandler(torch::jit::Graph *graph,
                               torch::jit::Node *node) {
  // aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor

  // Identity will just create a copy
  return createIdentity(graph, {node->input(0)});
}

// Lowers aten::copy_ as an identity of `src`, or a cast of `src` to the
// destination scalar type when the two types differ. The result takes the
// requires_grad setting of `src`.
torch::jit::Node *copyHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::copy_(Tensor self, Tensor src, bool non_blocking) -> Tensor
  auto *dest = node->input(0);
  auto *src = node->input(1);

  at::ScalarType const dest_type = getNodeScalarType(dest);
  at::ScalarType const src_type = getNodeScalarType(src);

  torch::jit::Node *copy = nullptr;
  if (src_type == dest_type) {
    copy = createIdentity(graph, {src});
  } else {
    copy = createCast(graph, src, dest_type);
  }
  ERROR_ON(copy == nullptr);

  copy->output()->setType(
      copy->output()->type()->expect<c10::TensorType>()->withRequiresGrad(
          src->type()->expect<c10::TensorType>()->requiresGrad()));

  return copy;
}

// Inserts a boolean constant `false` into the graph and returns its node; the
// node being handled is ignored.
torch::jit::Node *justReturnFalse(torch::jit::Graph *graph,
                                  torch::jit::Node * /*unused*/) {
  c10::IValue const value{false};
  torch::jit::Value *val = insertConstant(graph, value);
  return val->node();
}

torch::jit::Node *linearHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  // aten::linear(Tensor input, Tensor weight, Tensor? bias) -> Tensor
  auto *x = node->input(0);
  auto *w = node->input(1);
  auto *b = node->input(2);

  // x @ w^T, plus the bias when one is given.
  auto *w_t = createTranspose(graph, {w}, {1, 0});
  auto *output = createMatmul(graph, {x, w_t->output()});

  if (!isNone(b)) {
    output = createAdd(graph, {output->output(), b});
  }

  return output;
}

// Lowers aten::gather(self, dim, index) onto a gather over the flattened
// input: the gather axis is moved to the innermost position, the data is
// flattened, and per-row offsets (multiples of the axis length) are added to
// the indices so that they address the flattened data.
torch::jit::Node *gatherHandler(torch::jit::Graph *graph,
                                torch::jit::Node *node) {
  auto *input = node->input(0);
  auto tensor_type = input->type()->expect<c10::TensorType>();
  auto axis = handleDimensionParam(node->input(1), tensor_type);
  auto *indices = node->input(2);
  auto input_shape = shapeFromTensor(input);
  auto index_shape = shapeFromTensor(indices);
  auto stride = input_shape[axis];

  // Away from the gather axis the index tensor may not be larger than the
  // input.
  for (unsigned s = 0; s < input_shape.size(); ++s) {
    if (s != axis) {
      ERROR_ON(input_shape[s] < index_shape[s]);
    }
  }

  // Move gather axis to the innermost dim
  std::vector<std::int64_t> permutation;
  const unsigned input_num_dims = input_shape.size();
  permutation.resize(input_num_dims);
  std::iota(permutation.begin(), permutation.end(), 0);
  permutation.push_back(permutation[axis]);
  permutation.erase(permutation.begin() + axis);

  if (axis != input_num_dims - 1) {
    input = createTranspose(graph, {input}, permutation)->output();
    input_shape.push_back(input_shape[axis]);
    input_shape.erase(input_shape.begin() + axis);
  }

  // Flatten the data
  auto *flatten_input = createFlatten(graph, {input}, 0)->output();

  // The initial value fixes the accumulator type: a plain `1` would
  // accumulate in int.
  int64_t num_offsets =
      std::accumulate(index_shape.begin(), index_shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());
  num_offsets /= index_shape[axis];

  // Transpose the indices to make them broadcastable with offsets
  std::vector<std::int64_t> idx_permutation;
  idx_permutation.resize(index_shape.size());
  std::iota(idx_permutation.begin(), idx_permutation.end(), 0);
  idx_permutation.insert(idx_permutation.begin(), idx_permutation[axis]);
  idx_permutation.erase(idx_permutation.begin() + axis + 1);

  if (axis != 0) {
    indices = createTranspose(graph, {indices}, idx_permutation)->output();
    index_shape.insert(index_shape.begin(), index_shape[axis]);
    index_shape.erase(index_shape.begin() + (axis + 1));
  }

  // Create shape for offsets that is broadcastable with indices tensor
  std::vector<std::int64_t> offset_shape = {index_shape.begin() + 1,
                                            index_shape.end()};
  // Make the offsets
  std::vector<std::int64_t> offsets_val;

  int64_t num_data =
      std::accumulate(input_shape.begin(), input_shape.end(), std::int64_t{1},
                      std::multiplies<std::int64_t>());
  num_data /= input_shape[input_num_dims - 1];

  torch::jit::Value *offsets;
  // Case where one or more indices dims size < data size
  if (num_offsets != num_data) {
    // Create the offsets tensor from data_size
    // then slice it to match indices_size
    auto data_shape = shapeFromTensor(node->input(0));
    data_shape.insert(data_shape.begin(), data_shape[axis]);
    data_shape.erase(data_shape.begin() + (axis + 1));
    std::vector<std::int64_t> temp_offsets_shape = {data_shape.begin() + 1,
                                                    data_shape.end()};
    offsets_val.resize(num_data);
    std::iota(offsets_val.begin(), offsets_val.end(), 0);
    for (auto &v : offsets_val) {
      v *= stride;
    }
    offsets =
        createConstantInt(graph, offsets_val, temp_offsets_shape)->output();
    for (unsigned k = 0; k < offset_shape.size(); ++k) {
      if (offset_shape[k] != temp_offsets_shape[k]) {
        offsets = createSlice(graph, {offsets}, {offset_shape[k]}, {0}, {k})
                      ->output();
      }
    }
  } else {
    offsets_val.resize(num_offsets);
    std::iota(offsets_val.begin(), offsets_val.end(), 0);
    for (auto &v : offsets_val) {
      v *= stride;
    }
    offsets = createConstantInt(graph, offsets_val, {offset_shape})->output();
  }

  auto *new_indices = createAdd(graph, {indices, offsets})->output();

  // Gather the elements
  auto *output = createGather(graph, {flatten_input, new_indices}, 1)->output();

  // remove the dim-0 added by gather
  output = createSqueeze(graph, {output}, {0})->output();

  // transpose back to the original indices shape if needed
  if (axis != 0) {
    std::iota(idx_permutation.begin(), idx_permutation.end(), 0);
    idx_permutation.erase(idx_permutation.begin());
    idx_permutation.insert(idx_permutation.begin() + axis, 0);
    output = createTranspose(graph, {output}, idx_permutation)->output();
  }
  return output->node();
}

// Lowers aten::take_along_dim onto the aten::gather handler. With a dim, the
// input and the indices are first broadcast against each other (except along
// dim); without one, both are flattened and gathered along dimension 0.
torch::jit::Node *takeAlongDimHandler(torch::jit::Graph *graph,
                                      torch::jit::Node *node) {
  // aten::take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor
  torch::jit::Value *input = node->input(0);
  torch::jit::Value *indices = node->input(1);
  torch::jit::Value *dim = node->input(2);

  const std::vector<std::int64_t> input_shape = shapeFromTensor(input);
  std::vector<std::int64_t> indices_shape = shapeFromTensor(indices);

  if (!isNone(dim)) {
    auto dim_value = constantToLong(dim->node());
    // A negative dim counts from the back; wrap it before it is used with
    // vector::at(), which would otherwise throw.
    if (dim_value < 0) {
      dim_value += static_cast<std::int64_t>(input_shape.size());
    }

    // Broadcasts `value` to `shape` through the aten::broadcast_to handler
    // and records the new sizes on the resulting value's type.
    const auto broadcast_to = [&](torch::jit::Value *value,
                                  const std::vector<std::int64_t> &shape) {
      std::vector<torch::jit::Value *> shape_values(shape.size(), nullptr);
      std::transform(shape.cbegin(), shape.cend(), shape_values.begin(),
                     [&](const auto elem) -> torch::jit::Value * {
                       return wrapInConstant1D(graph, elem);
                     });
      torch::jit::Value *shape_list =
          createAndInsertNode(graph, c10::prim::ListConstruct, shape_values)
              ->output();
      auto *broadcasted_value =
          createHandlerOperation(graph, getHandler(c10::aten::broadcast_to),
                                 {value, shape_list})
              ->output();
      broadcasted_value->setType(
          value->type()->expect<c10::TensorType>()->withSizes(shape));
      return broadcasted_value;
    };

    auto self_sizes = input_shape;
    // update number of elements at dim as per indices
    self_sizes.at(dim_value) = indices_shape.at(dim_value);
    if (auto bcast_shape = at::infer_size(self_sizes, indices_shape);
        bcast_shape != indices_shape) {
      indices = broadcast_to(indices, bcast_shape);
    }

    // update number of elements at dim as per self
    indices_shape.at(dim_value) = input_shape.at(dim_value);
    if (auto bcast_shape = at::infer_size(indices_shape, input_shape);
        bcast_shape != input_shape) {
      input = broadcast_to(input, bcast_shape);
    }
  } else {
    // Reshapes `value` to one dimension; rank-1 values are returned as they
    // are and rank-0 values become a single-element vector.
    const auto flatten =
        [&](torch::jit::Value *value,
            const std::vector<std::int64_t> &shape) -> torch::jit::Value * {
      const auto rank = shape.size();
      if (rank == 1) {
        return value;
      }
      const int64_t num_elems =
          rank > 1 ? std::accumulate(shape.cbegin(), shape.cend(),
                                     std::int64_t{1},
                                     std::multiplies<std::int64_t>())
                   : 1;
      return createReshape(graph, value, {num_elems})->output();
    };

    input = flatten(input, input_shape);
    indices = flatten(indices, indices_shape);
    dim = wrapInConstant1D(graph, 0);
  }

  return createHandlerOperation(graph, getHandler(c10::aten::gather),
                                {input, dim, indices});
}

// Lowers scatter with an optional reduction argument onto a scatter-reduce
// op. A scalar constant `src` is cast to the input's type if necessary and
// expanded to the shape of `index`.
torch::jit::Node *scatterHandler(torch::jit::Graph *graph,
                                 torch::jit::Node *node) {
  auto *input = node->input(0);
  const auto input_type = input->type()->expect<c10::TensorType>();
  const auto dim = handleDimensionParam(node->input(1), input_type);
  auto *index = node->input(2);
  auto *src = node->input(3);

  // `scatter` can be passed a single value for `src` as a tensor constant, so
  // broadcast it up.
  if (isConstantScalar(src)) {
    auto *shape = intVectorToIrConstant(graph, shapeFromTensor(index));
    const auto input_scalar_type = *input_type->scalarType();
    if (input_scalar_type !=
        *src->type()->expect<c10::TensorType>()->scalarType()) {
      // poplibs scatter requires that `src` have the same data type as input so
      // cast it if needed
      src = castToPromoteType(graph, src, input_scalar_type);
    }
    src = createExpand(graph, {src, shape})->output();
  }

  // NOTE(review): input(3) has already been read above, so this branch looks
  // unreachable - confirm whether `< 5` (i.e. "no reduction argument") was
  // intended.
  if (node->inputs().size() < 4) {
    return createScatterElements(graph, {input, index, src}, dim);
  }

  // reduction type is optional argument
  const auto reduce = node->inputs().size() < 5
                          ? static_cast<std::int32_t>(ScatterReduction::None)
                          : getReductionMethod(node->input(4)->node());
  const auto input_shape = shapeFromTensor(input);
  const auto axis_size = input_shape.at(dim);
  static constexpr bool enable_index_broadcast = false;

  return createScatterreduce(graph, {src, index, input}, axis_size, dim,
                             enable_index_broadcast, reduce);
}

// Builds a tensor of the given shape filled with `v`, converted to `type`
// (after coercion to a supported type). A constant tensor fill value is
// folded into a constant; anything else becomes a cast followed by an expand.
torch::jit::Node *fullCommon(torch::jit::Graph *graph, torch::jit::Value *v,
                             at::ScalarType type,
                             const std::vector<std::int64_t> &shape) {
  auto *vn = v->node();
  auto stype = coerceToSupportedType(type);
  if (isTensorConstant(vn) && vn->output()->type()->cast<c10::TensorType>()) {
    auto v_scalar = getNodeTensorAttrValue(vn).to(stype).item();
    return tensorToConstant(graph, at::full(shape, v_scalar, stype));
  }
  auto *v_cast = createCast(graph, v, stype)->output();
  auto *c_shape = intVectorToIrConstant(graph, shape);
  return createExpand(graph, {v_cast, c_shape});
}

torch::jit::Node *fullHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::full(int[] size, Scalar fill_value,
  //            ScalarType? dtype=None, Layout? layout=None,
  //            Device? device=None, bool? pin_memory=None) -> Tensor
  // aten::new_full(Tensor self, int[] size, Scalar fill_value,
  //                ScalarType? dtype=None, Layout? layout=None,
  //                Device? device=None, bool? pin_memory=None) -> Tensor

  // new_full has an extra leading `self` argument, which shifts the others.
  size_t shape_index = 0;
  if (node->kind() == c10::aten::new_full) {
    shape_index = 1;
  }

  auto *shape = node->input(shape_index + 0);
  auto *v = node->input(shape_index + 1);
  auto *dtype = node->input(shape_index + 2);

  auto lv_shape = constantToLongVec(shape->node());

  // Default to float, or to the type of `self` for new_full.
  auto type = c10::ScalarType::Float;
  if (node->kind() == c10::aten::new_full) {
    type = getNodeScalarType(node->input(0));
  }
  // The specified dtype takes precedence
  if (!isNone(dtype)) {
    type = constantToScalarType(dtype->node());
  }

  return fullCommon(graph, v, type, lv_shape);
}

torch::jit::Node *fullLikeHandler(torch::jit::Graph *graph,
                                  torch::jit::Node *node) {
  // aten::full_like(Tensor self, Scalar fill_value) -> Tensor
  auto *v = node->input(1);
  // Shape and type are taken from the node's own output.
  auto *like = node->output(0);
  auto like_shape = shapeFromTensor(like);
  auto like_type = getNodeScalarType(like);
  return fullCommon(graph, v, like_type, like_shape);
}

// Always raises an error: triu is not lowered by this handler.
torch::jit::Node *triuHandler(torch::jit::Graph *graph,
                              torch::jit::Node *node) {
  // aten::triu(Tensor self, int diagonal=0) -> Tensor
  ERROR("torch.triu is only supported within constant expressions, "
        "for example torch.ones(3, 3).triu_().");
  UNUSED(graph);
  UNUSED(node);
  return nullptr;
}

torch::jit::Node *ipuPrintTensorHandler(torch::jit::Graph *graph,
                                        torch::jit::Node *node) {
  auto *x = node->input(0);
  auto title = constantToString(node->input(1)->node());
  auto print_gradient = constantToInt(node->input(2)->node());
  auto summarise_threshold = constantToInt(node->input(3)->node());
  auto edge_items = constantToInt(node->input(4)->node());
  auto max_line_width = constantToInt(node->input(5)->node());
  auto digits = constantToInt(node->input(6)->node());
  auto float_format = constantToInt(node->input(7)->node());
  auto separator = constantToString(node->input(8)->node());
  auto open_bracket = constantToString(node->input(9)->node());
  auto close_bracket = constantToString(node->input(10)->node());

  return createPrinttensor(graph, {x}, print_gradient, title, summarise_threshold,
edge_items, max_line_width, digits, float_format, *separator.c_str(), *open_bracket.c_str(), *close_bracket.c_str()); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(c10::aten::size, sizeHandler); registerHandler(c10::prim::NumToTensor, numToTensorHandler); registerHandler(c10::aten::flip, flipHandler); registerHandler(c10::aten::repeat, repeatHandler); registerHandler(c10::aten::is_complex, justReturnFalse); registerHandler(c10::aten::roll, rollHandler); registerHandler(c10::aten::clone, cloneHandler); registerHandler(c10::aten::copy_, copyHandler); registerHandler(c10::aten::linear, linearHandler); registerHandler(c10::aten::gather, gatherHandler); registerHandler(c10::aten::scatter, scatterHandler); registerHandler(c10::aten::full, fullHandler); registerHandler(c10::aten::new_full, fullHandler); registerHandler(c10::aten::full_like, fullLikeHandler); registerHandler(c10::aten::triu, triuHandler); registerHandler(symbols::poptorch::ipu_print_tensor, ipuPrintTensorHandler); registerHandler(c10::aten::take_along_dim, takeAlongDimHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/FpsOp.cpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
#include #include #include "../PopartCanonicalizationUtils.hpp" #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace { torch::jit::Node *prepareMinimum(torch::jit::Graph *graph, torch::jit::Value *const lhs, torch::jit::Value *const rhs) { return createMin(graph, {lhs, rhs}); } torch::jit::Node *prepareRowUpdate(torch::jit::Graph *graph, torch::jit::Value *const value, torch::jit::Value *const new_row, torch::jit::Value *const row_num, std::int64_t row_len) { return createDynamicupdate(graph, {value, row_num, new_row}, {0}, {row_len}, 0); } torch::jit::Node *updateIdxs(torch::jit::Graph *graph, torch::jit::Value *const idxs, std::int64_t offset, torch::jit::Value *const new_val) { auto *const offset_node = createConstantLong(graph, {offset}, {1}); return createDynamicupdate(graph, {idxs, offset_node->output(), new_val}, {0}, {1}, 0); } torch::jit::Node *prepareRowSlice(torch::jit::Graph *graph, torch::jit::Value *const value, torch::jit::Value *const row_num, std::int64_t row_len) { auto *const slice = createConstantFloat32(graph, {0.0}, {row_len}); return createDynamicslice(graph, {value, row_num, slice->output()}, {0}, {1}, 1); } torch::jit::Node *prepareArgmax(torch::jit::Graph *graph, torch::jit::Value *const dists, torch::jit::Value *const row_num, std::int64_t row_len) { auto *const dists_row = prepareRowSlice(graph, dists, row_num, row_len); return createArgmax(graph, {dists_row->output()}, 0, 0l); } torch::jit::Node *prepareOutput(torch::jit::Graph *graph, std::int64_t src_len) { const auto zeros = std::vector(src_len, 0l); return createConstantInt(graph, zeros, {src_len}); } torch::jit::Node *prepareStartIdx(torch::jit::Graph *graph, float range_begin, float range_end, bool random_start) { if (random_start) { return createRandomUniform(graph, nullptr, {1}, range_end, range_begin, c10::ScalarType::Int); } 
return createConstantLong(graph, {static_cast(range_begin)}, {1}); } torch::jit::Node *prepareMaskedColDists(torch::jit::Graph *graph, torch::jit::Value *const dists, torch::jit::Value *const col_idx) { return createDynamiczero(graph, {dists, col_idx}, {1}, {1}); } torch::jit::Node *prepareDists(torch::jit::Graph *graph, torch::jit::Value *const src) { auto *const p = createConstantFloat32(graph, {2.0}, {1}); return createHandlerOperation(graph, getHandler(c10::aten::cdist), {src, src, p->output()}); } torch::jit::Node *maskDists(torch::jit::Graph *graph, torch::jit::Value *const dists, const std::vector &offset, const std::vector &sizes) { auto *const offset_node = createConstantInt(graph, offset, {2}); return createDynamiczero(graph, {dists, offset_node->output()}, {0, 1}, sizes); } torch::jit::Node *prepareMaskedDists(torch::jit::Graph *graph, torch::jit::Value *const src, const std::vector &ptr) { auto *dists = prepareDists(graph, src); if (ptr.size() > 2) { dists = maskDists(graph, dists->output(), {0, ptr[1]}, {ptr[1], ptr.back() - ptr[1]}); for (size_t i = 2; i < ptr.size() - 1; i++) { dists = maskDists(graph, dists->output(), {ptr[i - 1], 0}, {ptr[i] - ptr[i - 1], ptr[i - 1]}); dists = maskDists(graph, dists->output(), {ptr[i - 1], ptr[i]}, {ptr[i] - ptr[i - 1], ptr.back() - ptr[i]}); } dists = maskDists(graph, dists->output(), {ptr[ptr.size() - 2], 0}, {ptr.back() - ptr[ptr.size() - 2], ptr[ptr.size() - 2]}); } return dists; } std::vector calcDeg(const std::vector &ptr, float ratio) { std::vector deg(ptr.size(), 0); for (size_t i = 1; i < ptr.size(); i++) { deg[i] = std::ceil(static_cast(ptr[i] - ptr[i - 1]) * ratio); deg[i] += deg[i - 1]; } return deg; } torch::jit::Node *fpsHandler(torch::jit::Graph *graph, torch::jit::Node *node) { torch::jit::Value *const src = node->input(0); const std::vector ptr = constantToLongVec(node->input(1)->node()); const float ratio = constantToFloat(node->input(2)->node()); const bool random_start = 
constantToBool(node->input(3)->node()); const std::vector src_shape = shapeFromTensor(src); // 0. Prepare output tensor const auto deg = calcDeg(ptr, ratio); const auto out_len = deg.back(); auto *idxs = prepareOutput(graph, out_len); // 1. Create masked dists (leave only the slices representing batches) auto *dists = prepareMaskedDists(graph, src, ptr); // 2. Iterate over batches defined in deg std::int64_t pos_in_idxs = 0; for (size_t b = 1; b < deg.size(); b++) { // 3. Generate start idx... auto *prev_idx = prepareStartIdx(graph, ptr[b - 1], ptr[b] - 1, random_start); // 4. ...and insert it into the outputs idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, prev_idx->output()); if (pos_in_idxs == deg[b] || pos_in_idxs == out_len) { continue; } // 5. Zero out the dists column with prev_idx number dists = prepareMaskedColDists(graph, dists->output(), prev_idx->output()); // 6. Get the index of the max value in the currently processed dists row auto *idx = prepareArgmax(graph, dists->output(), prev_idx->output(), src_shape[0]); idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, idx->output()); while (pos_in_idxs < deg[b] && pos_in_idxs < out_len) { // 7. Zero out the dists column with idx number dists = prepareMaskedColDists(graph, dists->output(), idx->output()); auto *const prev_row = prepareRowSlice(graph, dists->output(), prev_idx->output(), src_shape[0]); auto *const curr_row = prepareRowSlice(graph, dists->output(), idx->output(), src_shape[0]); // 8. Update the currently processed row with the min of the current and // previous row auto *const curr_dists_row = prepareMinimum(graph, prev_row->output(), curr_row->output()); dists = prepareRowUpdate(graph, dists->output(), curr_dists_row->output(), idx->output(), src_shape[0]); prev_idx = idx; // 9. 
Get the index of the max value in the currently processed dists row idx = prepareArgmax(graph, dists->output(), idx->output(), src_shape[0]); idxs = updateIdxs(graph, idxs->output(), pos_in_idxs++, idx->output()); } } return idxs; } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(poptorch::symbols::poptorch::fps, fpsHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/GridOp.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #include "../PopartCanonicalizationUtils.hpp" #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "../ScatterReduction.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" namespace poptorch { namespace { torch::jit::Node *gridHandler(torch::jit::Graph *graph, torch::jit::Node *node) { auto *pos = node->input(0); auto *size = node->input(1); auto *start = node->input(2); auto *end = node->input(3); std::vector pos_shape = shapeFromTensor(pos); const std::vector size_shape = shapeFromTensor(size); int num_voxels_size = 1; if (pos_shape.size() > 1) { num_voxels_size = std::accumulate(pos_shape.cbegin() + 1, pos_shape.cend(), 1, std::multiplies()); pos_shape = {pos_shape[0], num_voxels_size}; pos = createReshape(graph, pos, pos_shape)->output(); } if (isNone(start)) { start = createReducemin(graph, {pos}, {0}, 0)->output(); } if (isNone(end)) { end = createReducemax(graph, {pos}, {0}, 0)->output(); } pos = createSub(graph, {pos, createUnsqueeze(graph, {start}, {0})->output()}) ->output(); start = createCast(graph, start, c10::kFloat)->output(); end = createCast(graph, end, c10::kFloat)->output(); size = createCast(graph, size, c10::kFloat)->output(); auto *ones = wrapInConstantVec(graph, {1}); auto *zeros = 
wrapInConstantVec(graph, {0}); auto *num_voxels = createDiv(graph, {createSub(graph, {end, start})->output(), size}) ->output(); num_voxels = createCast(graph, num_voxels, c10::kInt)->output(); num_voxels = createAdd(graph, {num_voxels, wrapInConstantVec(graph, {1})})->output(); num_voxels->setType(num_voxels->type()->expect()->withSizes( {num_voxels_size})); num_voxels = createHandlerOperation(graph, getHandler(c10::aten::cumprod), {num_voxels, zeros}) ->output(); num_voxels = createConcat(graph, {ones, num_voxels}, 0)->output(); num_voxels = createSlice(graph, {num_voxels}, {size_shape.at(0)}, {0}, {0})->output(); num_voxels->setType(num_voxels->type()->expect()->withSizes( {size_shape.at(0)})); pos = createCast(graph, pos, c10::kFloat)->output(); size = createReshape(graph, size, {1, std::accumulate(size_shape.cbegin(), size_shape.cend(), 1, std::multiplies())}) ->output(); auto *out = createDiv(graph, {pos, size})->output(); out = createCast(graph, out, c10::kInt)->output(); out = createMul(graph, {out, num_voxels})->output(); return createReducesum(graph, {out}, {1}, 0); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(torch_cluster::grid, gridHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/popart_canonicalization/pyg_torch_cluster/NearestOp.cpp ================================================ // Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
#include #include #include "../PopartCanonicalizationUtils.hpp" #include "../PoptorchStaticInit.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace { std::tuple batchSizes(const torch::jit::Value *x, const torch::jit::Value *y, std::vector &batch_x, std::vector &batch_y) { if (!std::is_sorted(batch_x.cbegin(), batch_x.cend())) { throw std::invalid_argument("'batch_x' is not sorted"); } if (!std::is_sorted(batch_y.cbegin(), batch_y.cend())) { throw std::invalid_argument("'batch_y' is not sorted"); } std::int64_t batch_x_size = batch_x.size(); std::int64_t batch_y_size = batch_y.size(); if (batch_x_size == 0 && (batch_y_size != 0)) { batch_x_size = shapeFromTensor(x)[0]; batch_x = std::vector(batch_x_size, 0); } if (batch_y_size == 0 && (batch_x_size != 0)) { batch_y_size = shapeFromTensor(y)[0]; batch_y = std::vector(batch_y_size, 0); } return {batch_x_size, batch_y_size}; } std::tuple, std::vector> batchShapes(torch::jit::Graph *graph, const std::vector &x_shape, const std::vector &y_shape, torch::jit::Value *&batch_x, torch::jit::Value *&batch_y) { std::vector batch_x_shape = shapeFromTensor(batch_x); std::vector batch_y_shape = shapeFromTensor(batch_y); if (batch_x_shape[0] == 0 && (batch_y_shape[0] != 0)) { batch_x_shape = {x_shape[0]}; const std::vector data(batch_x_shape[0], 0); batch_x = createConstantLong(graph, data, batch_x_shape)->output(); } if (batch_y_shape[0] == 0 && (batch_x_shape[0] != 0)) { batch_y_shape = {y_shape[0]}; const std::vector data(batch_y_shape[0], 0); batch_y = createConstantLong(graph, data, batch_y_shape)->output(); } return {batch_x_shape, batch_y_shape}; } std::vector prepareInputTensor(torch::jit::Graph *graph, torch::jit::Value *&input) { auto input_shape = shapeFromTensor(input); if (input_shape.size() == 1) { const auto input_accum_shape = std::accumulate( input_shape.cbegin(), input_shape.cend(), 1, 
std::multiplies()); input_shape = std::vector{input_accum_shape, 1}; input = createReshape(graph, input, input_shape)->output(); } return input_shape; } void validateInputShapes(const std::vector &x_shape, const std::vector &y_shape) { if (x_shape.size() > 2 || y_shape.size() > 2) { throw std::invalid_argument( "Inputs `x` and `y` should be max 2D tensors, while `x` has " + std::to_string(x_shape.size()) + " dims and `y` has " + std::to_string(y_shape.size()) + " dims."); } if (x_shape[1] != y_shape[1]) { throw std::invalid_argument( "Inputs shapes inconsistent x.shape[1]=" + std::to_string(x_shape[1]) + " vs. y.shape[1]=" + std::to_string(y_shape[1])); } } std::vector uniqueConsecutive(std::vector batch) { auto last = std::unique(batch.begin(), batch.end()); batch.erase(last, batch.end()); return batch; } void validateBatchIndices(const std::vector &batch_x, const std::vector &batch_y) { const auto unique_batch_x = uniqueConsecutive(batch_x); const auto unique_batch_y = uniqueConsecutive(batch_y); if (unique_batch_x != unique_batch_y) { throw std::invalid_argument("Some batch indices occur in 'batch_x' " "that do not occur in 'batch_y'"); } } void validateSizes(std::int64_t x_size, std::int64_t y_size, std::int64_t batch_x_size, std::int64_t batch_y_size) { if (x_size != batch_x_size) { throw std::invalid_argument("x.size(0) == batch_x.size(0)"); } if (y_size != batch_y_size) { throw std::invalid_argument("y.size(0) == batch_y.size(0)"); } } void validateShapes(const std::vector &x_shape, const std::vector &y_shape, const std::vector &batch_x_shape, const std::vector &batch_y_shape) { if (batch_x_shape.size() != 1 || x_shape.front() != batch_x_shape.front()) { throw std::invalid_argument("x.size(0) == batch_x.size(0)"); } if (batch_y_shape.size() != 1 || y_shape.front() != batch_y_shape.front()) { throw std::invalid_argument("y.size(0) == batch_y.size(0)"); } } void rescaleInputs(torch::jit::Graph *graph, torch::jit::Value *&x, torch::jit::Value *&y, const 
std::vector &x_shape, const std::vector &y_shape) { static constexpr bool keepdims = false; torch::jit::Value *const min_x = createReducemin(graph, {x}, {0, 1}, static_cast(keepdims)) ->output(); torch::jit::Value *const min_y = createReducemin(graph, {y}, {0, 1}, static_cast(keepdims)) ->output(); torch::jit::Value *const min_xy = createMin(graph, {min_x, min_y})->output(); x = createSub(graph, {x, min_xy})->output(); y = createSub(graph, {y, min_xy})->output(); torch::jit::Value *const max_x = createReducemax(graph, {x}, {0, 1}, static_cast(keepdims)) ->output(); torch::jit::Value *const max_y = createReducemax(graph, {y}, {0, 1}, static_cast(keepdims)) ->output(); torch::jit::Value *const max_xy = createMax(graph, {max_x, max_y})->output(); x = createDiv(graph, {x, max_xy})->output(); x->setType(x->type()->expect()->withSizes(x_shape)); y = createDiv(graph, {y, max_xy})->output(); y->setType(y->type()->expect()->withSizes(y_shape)); } void concatFeatures(torch::jit::Graph *graph, torch::jit::Value *&input, const std::vector &input_shape, std::vector &batch, std::int64_t D) { std::transform(batch.cbegin(), batch.cend(), batch.begin(), [&D](std::int64_t value) { return 2 * D * value; }); torch::jit::Value *batch_tensor = createConstantLong(graph, batch, {static_cast(batch.size()), 1}) ->output(); input = createConcat(graph, {input, batch_tensor}, 1)->output(); const std::vector concat_shape{input_shape[0], input_shape[1] + 1}; input->setType( input->type()->expect()->withSizes(concat_shape)); } void concatFeatures(torch::jit::Graph *graph, torch::jit::Value *&input, const std::vector &input_shape, torch::jit::Value *&batch, std::int64_t batch_size, std::int64_t D) { const std::vector data(batch_size, 2 * D); const std::vector batch_shape{batch_size, 1}; torch::jit::Value *multiplier = createConstantInt(graph, data, batch_shape)->output(); batch = createReshape(graph, batch, batch_shape)->output(); batch = createMul(graph, {multiplier, batch})->output(); input = 
createConcat(graph, {input, batch}, 1)->output(); const std::vector concat_shape{input_shape[0], input_shape[1] + 1}; input->setType( input->type()->expect()->withSizes(concat_shape)); } torch::jit::Node *vq(torch::jit::Graph *graph, torch::jit::Value *const x, torch::jit::Value *const y) { auto *const p = createConstantFloat32(graph, {2.0}, {1}); auto *const distances = createHandlerOperation( graph, getHandler(c10::aten::cdist), {x, y, p->output()}); return createArgmin(graph, {distances->output()}, 1 /*axis*/, 0 /*keepdims*/); } torch::jit::Node *nearestBatchListHandler(torch::jit::Graph *graph, torch::jit::Node *node) { torch::jit::Value *x = node->input(0); torch::jit::Value *y = node->input(1); std::vector batch_x = constantToLongVec(node->input(2)->node()); std::vector batch_y = constantToLongVec(node->input(3)->node()); const auto x_shape = prepareInputTensor(graph, x); const auto y_shape = prepareInputTensor(graph, y); validateInputShapes(x_shape, y_shape); const auto [batch_x_size, batch_y_size] = batchSizes(x, y, batch_x, batch_y); if ((batch_x_size != 0) && (batch_y_size != 0)) { validateBatchIndices(batch_x, batch_y); validateSizes(x_shape[0], y_shape[0], batch_x_size, batch_y_size); rescaleInputs(graph, x, y, x_shape, y_shape); const std::int64_t d = x_shape.back(); concatFeatures(graph, x, x_shape, batch_x, d); concatFeatures(graph, y, y_shape, batch_y, d); } return vq(graph, x, y); } torch::jit::Node *nearestHandler(torch::jit::Graph *graph, torch::jit::Node *node) { torch::jit::Value *x = node->input(0); torch::jit::Value *y = node->input(1); torch::jit::Value *batch_x = node->input(2); torch::jit::Value *batch_y = node->input(3); const auto x_shape = prepareInputTensor(graph, x); const auto y_shape = prepareInputTensor(graph, y); validateInputShapes(x_shape, y_shape); const auto [batch_x_shape, batch_y_shape] = batchShapes(graph, x_shape, y_shape, batch_x, batch_y); if (!batch_x_shape.empty() && !batch_y_shape.empty()) { // No validation of batch 
indices as we can't assert from Poplar validateShapes(x_shape, y_shape, batch_x_shape, batch_y_shape); rescaleInputs(graph, x, y, x_shape, y_shape); const std::int64_t d = x_shape.back(); concatFeatures(graph, x, x_shape, batch_x, batch_x_shape[0], d); concatFeatures(graph, y, y_shape, batch_y, batch_y_shape[0], d); } return vq(graph, x, y); } } // namespace __attribute__((constructor(HANDLER_INIT_PRIORITY))) static void registration() { registerHandler(poptorch::symbols::poptorch::nearest, nearestHandler); registerHandler(poptorch::symbols::poptorch::nearest_batch_list, nearestBatchListHandler); } } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/AddListNumElements.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. #include #include #include #include "poptorch/Utils.hpp" #include "poptorch_logging/Error.hpp" namespace poptorch { namespace type_and_constant_canonicalization { namespace { void recursivelySwitchType(torch::jit::Node *node, const torch::jit::TypePtr &new_type) { for (auto use : node->output()->uses()) { ERROR_ON(use.user->kind() == c10::prim::ListConstruct); // No known JIT model causes this, but one may emerge in which case // this algorithm will need to handle it. 
ERROR_ON(use.user->kind() == c10::prim::TupleUnpack); if (use.user->kind() == c10::prim::TupleConstruct) { const auto &tuple_elements = use.user->output()->type()->expect()->elements(); std::vector new_types; new_types.reserve(tuple_elements.size()); std::copy(tuple_elements.begin(), tuple_elements.end(), std::back_inserter(new_types)); // This will be the list or nested tuple containing list new_types[use.offset] = new_type; auto new_tuple_type = c10::TupleType::create(new_types); use.user->output()->setType(new_tuple_type); recursivelySwitchType(use.user, new_tuple_type); } } } } // namespace void addListNumElements(torch::jit::Graph *graph, bool revert) { logging::LogContext ctx_func("addListNumElements"); for (torch::jit::Node *node : graph->nodes()) { logging::LogContext ctx("processing " + nodeToString(node)); if (node->kind() == c10::prim::ListConstruct) { auto list_inputs = node->inputs(); // Lists should never be nested as the JIT tracer does not support, // but always good to check in case. for (auto *input : list_inputs) { ERROR_ON(input->type()->kind() == c10::TypeKind::ListType); } c10::TypePtr new_type; if (revert) { // Revert back to the orgiinal type auto lot_type = node->output()->type()->expect(); new_type = lot_type->getOriginalListType(); } else { // Switch to a ListTypeWithNumElements auto orig_type = node->output()->type()->expect(); auto num_elements = list_inputs.size(); new_type = std::make_shared( orig_type->getElementType(), num_elements); } node->output()->setType(new_type); // Any tuples which have te list need fixing. recursivelySwitchType(node, new_type); } } } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/CanonicaliseConstants.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include #include #include #include #include #include #include #include #include #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "../PoptorchSymbols.hpp" namespace poptorch { namespace type_and_constant_canonicalization { namespace { // Returns true for node kinds which change compiler state. These need to be // removed for any host side tensors but otherwise does not make connected // node a PopART only node. bool compilerStateChangingKind(const torch::jit::NodeKind &kind) { return (kind == symbols::poptorch::begin_ipu_block || kind == symbols::poptorch::end_ipu_block || kind == symbols::poptorch::set_available_memory || kind == symbols::poptorch::push_name_scope || kind == symbols::poptorch::set_matmul_serialization); } bool popartOnlyNode(const torch::jit::NodeKind &kind) { return (!compilerStateChangingKind(kind) && kind != c10::prim::Constant && kind != c10::prim::TupleConstruct && kind != c10::prim::ListConstruct && kind != c10::prim::TupleUnpack && kind != c10::prim::ListUnpack && kind != c10::prim::Return); } // Check whether the node is (eventually) used host side, IPU or both UseOfNode getUseOfNode(torch::jit::Node *n, bool check_node_kind_itself = true) { // Check the kind of the node itself (for when not called on a prim constant). // This could be disabled explicitly by the caller. 
if (check_node_kind_itself && popartOnlyNode(n->kind())) { return UseOfNode::PopARTOnly; } if (check_node_kind_itself && n->kind() == c10::prim::Return) { return UseOfNode::HostSideOnly; } bool popart_use = false; bool host_use = false; // Check all outputs std::vector to_check; to_check.push_back(n); while (!to_check.empty()) { auto *cur_node = to_check.back(); to_check.pop_back(); for (auto *output : cur_node->outputs()) { for (auto use : output->uses()) { auto use_kind = use.user->kind(); if (use_kind == c10::prim::Return) { // This must be host use as we have not reached an op which would be // run on popart yet. host_use = true; } else if (popartOnlyNode(use_kind) || use_kind == symbols::poptorch::set_available_memory || use_kind == symbols::poptorch::set_matmul_serialization) { popart_use = true; } else { // We only need to check the node further if it is neither returned // nor used by a Popart op to_check.push_back(use.user); } } } } if (!host_use && !popart_use) { // Some nodes such as begin_ipu_block will simply remove the tensor so make // it a default tensor_constant for simplicity. 
return UseOfNode::PopARTOnly; } if (host_use && popart_use) { return UseOfNode::HostSideAndPopART; } if (host_use) { return UseOfNode::HostSideOnly; } return UseOfNode::PopARTOnly; } void replaceWithConstantTensor(torch::jit::Graph *graph, torch::jit::Node *n, const at::Tensor &t) { ERROR_ON(n->kind() != c10::prim::Constant); torch::jit::WithInsertPoint const insert_point(n); const WithNodeMetadata meta(n); poptorch::UseOfNode const use_of_node = getUseOfNode(n); auto *new_node = tensorToConstant(graph, t, use_of_node); for (size_t use_idx = 0; use_idx < n->output()->uses().size(); use_idx++) { auto u = n->output()->uses()[use_idx]; u.user->replaceInput(u.offset, new_node->output()); use_idx--; } } void warnDoubleOutOfRange(double val, torch::jit::Node *n) { if (val > std::numeric_limits::max() || val < std::numeric_limits::lowest()) { static std::uint64_t log_repeat = 0; logging::warn(log_repeat, "{}: torch.float64 constant cannot be " "represented as a torch.float32", nodeToString(n)); } } void warnLongOutOfRange(int64_t val, torch::jit::Node *n) { if (val > std::numeric_limits::max() || val < std::numeric_limits::lowest()) { static std::uint64_t log_repeat = 0; logging::warn(log_repeat, "{}: torch.int64 constant cannot be represented " "as a torch.int32", nodeToString(n)); } } void handleNumberConstant(torch::jit::Graph *graph, torch::jit::Node *n) { if (n->output()->type()->isSubtypeOf(c10::BoolType::get())) { replaceWithConstantTensor( graph, n, at::native::scalar_tensor(*torch::jit::constant_as(n->output()), at::kInt, c10::nullopt, at::kCPU)); } else { auto so = torch::jit::constant_as(n->output()); ERROR_ON(!so.has_value()); auto s = *so; c10::ScalarType dtype; if (s.isFloatingPoint()) { warnDoubleOutOfRange(s.toDouble(), n); dtype = at::kFloat; } else if (s.isIntegral(false)) { dtype = at::kInt; // Handle magic number 9223372036854775807 if (s.toLong() == std::numeric_limits::max()) { s = std::numeric_limits::max(); logging::info("{}: Using max value for 
torch.int32 in place of max " "value for torch.int64", nodeToString(n)); } else { warnLongOutOfRange(s.toLong(), n); } } else { ERROR("Unsupported constant type"); } auto wrapped_number = at::native::scalar_tensor(s, dtype, c10::nullopt, at::kCPU); wrapped_number.unsafeGetTensorImpl()->set_wrapped_number(true); replaceWithConstantTensor(graph, n, wrapped_number); } } void handleTensorConstant(torch::jit::Graph *graph, torch::jit::Node *n) { auto tensor_type = n->output()->type()->expect(); if (!tensor_type->scalarType().has_value()) { ERROR("Tensor constant without type"); } at::Tensor tensor; if (n->kindOf(c10::attr::value) == torch::jit::AttributeKind::ts) { tensor = getNodeTensorAttrValue(n); } else { ERROR_ON_MSG(n->kindOf(c10::attr::value) != torch::jit::AttributeKind::t, "[Internal] expected type 't' or 'ts' but got " << torch::jit::toString(n->kindOf(c10::attr::value))); tensor = n->t(c10::attr::value); } ERROR_ON(!tensor.defined()); const bool was_wrapped = tensor.unsafeGetTensorImpl()->is_wrapped_number(); if (tensor.scalar_type() == at::ScalarType::Double) { warnDoubleOutOfRange( *reinterpret_cast(tensor.unsafeGetTensorImpl()->data()), n); tensor = tensor.to(at::ScalarType::Float); } if (tensor.scalar_type() == at::ScalarType::Long) { warnLongOutOfRange( *reinterpret_cast(tensor.unsafeGetTensorImpl()->data()), n); tensor = tensor.to(at::ScalarType::Int); } // This gets lost in conversion and changes implicit casting if not set // (Must use an if as set_wrapped_number(false) can cause a PyTorch internal // error) if (was_wrapped) { tensor.unsafeGetTensorImpl()->set_wrapped_number(true); } replaceWithConstantTensor(graph, n, tensor); } void handleStringConstant(torch::jit::Graph *graph, torch::jit::Node *n) { std::string const s = n->s(c10::attr::value); std::vector shape_vec; shape_vec.push_back(s.length()); auto t = at::empty({shape_vec}, at::dtype(at::ScalarType::Char) .memory_format(c10::MemoryFormat::Contiguous)); std::memcpy(t.data_ptr(), s.c_str(), 
s.length() * sizeof(char)); replaceWithConstantTensor(graph, n, t); } // Visit an ivalue which is a tuple or list constant and single type constant // nodes and list/tuple constructs to replace it class ListTupleVisitor { enum class State { IN_TUPLE, IN_LIST }; // Maintain the information about the list or tuple at each level struct ListOrTupleInfo { ListOrTupleInfo(State state_, size_t elements_left_, c10::TypePtr container_type_) : state(state_), elements_left(elements_left_), container_type(std::move(container_type_)) {} // Whether or not the visitor is currently in a list or a tuple State state; // The number of elenents left to be visited (before a List/TupleConstruct) size_t elements_left; // The type of the list/tuple, preserved from first visit ahead of // constructing the list or tuple c10::TypePtr container_type; // All the nodes to be input to the List/TupleConstruct std::vector container_nodes; }; public: explicit ListTupleVisitor(torch::jit::Graph *graph) : _graph(graph), _last_node(nullptr) {} // We never return true as we visit every element bool operator()(const c10::IValue &i_value) { if (i_value.isGenericDict()) { ERROR("Dicts are not supported in constant canonicalisation."); } // Handle the visting of a list or tuple: actual creation will happen // once all its elements have been visited if (i_value.isTuple() || i_value.isList()) { handleListOrTuple(i_value); return false; } // Handle an element which is not a tuple or list handleConstant(i_value); // There will not be a further visit marking the completition of a tuple // or list, so this must be handled after the final constant. // In addition, in a nested scenario, this might trigger for then once // e.g. (1, (2, (3, 4))) will lead this block running three times. 
while (_info_stack.top().elements_left == 0) { handleTupleOrListConstruction(); if (_info_stack.empty()) { // All tuples and lists have been constructed break; } } return false; } const std::vector &getAllConstNodes() { return _all_const_nodes; } torch::jit::Node *getLastNode() { if (_last_node == nullptr) { // There is no last node: it means the list or tuple construction hasn't // been triggered (For example if it's an empty list/tuple). handleTupleOrListConstruction(); ERROR_ON(_last_node == nullptr); } return _last_node; } private: // Handle a list of the tuple: this involves merely recording the state, type // and number of elements as the inputs to a List/TupleConstruct will not have // been constructed at this point. void handleListOrTuple(const c10::IValue &i_value) { if (i_value.isTuple()) { _info_stack.emplace(State::IN_TUPLE, i_value.toTuple()->elements().size(), i_value.type()); } else { _info_stack.emplace(State::IN_LIST, i_value.toListRef().size(), i_value.type()); } } // Handle a tensor or numeric constant. This adds a constant of the same type // to the graph, which will later be canonicalised to a tensor constant. // Though this means that there will be an extra canonicalisation step, it // minimises code duplication. All constants are added to "_all_const_nodes" // for the later canonicalisation. 
void handleConstant(const c10::IValue &i_value) { ERROR_ON(_info_stack.empty()); auto *new_const = _graph->create(c10::prim::Constant); if (i_value.isTensor()) { new_const->output()->inferTypeFrom(i_value.toTensor()); setNodeTensorAttrValue(new_const, i_value.toTensor()); } else if (i_value.isInt()) { new_const->output()->setType(c10::IntType::get()); new_const->i_(c10::attr::value, i_value.toInt()); } else if (i_value.isDouble()) { new_const->output()->setType(c10::FloatType::get()); new_const->f_(c10::attr::value, i_value.toDouble()); } else if (i_value.isBool()) { new_const->output()->setType(c10::BoolType::get()); new_const->i_( c10::attr::value, static_cast(i_value.toBool())); } else if (i_value.isNone()) { // Assign NoneType so that the node can be skipped over // during constant canonicalization new_const->output()->setType(c10::NoneType::get()); } else { ERROR("Unsupported type for constant: " << i_value); } insertNodeInGraph(_graph, new_const); _info_stack.top().container_nodes.push_back(new_const); _all_const_nodes.push_back(new_const); _info_stack.top().elements_left--; } // Handle the actual constructions of a list or tuple once the last element // has been visited. 
void handleTupleOrListConstruction() { torch::jit::Node *construct_node; switch (_info_stack.top().state) { case State::IN_TUPLE: construct_node = _graph->create(c10::prim::TupleConstruct); break; case State::IN_LIST: construct_node = _graph->create(c10::prim::ListConstruct); break; default: ERROR("Unreachable"); } for (auto *element : _info_stack.top().container_nodes) { construct_node->addInput(element->output()); } construct_node->output()->setType(_info_stack.top().container_type); insertNodeInGraph(_graph, construct_node); _info_stack.pop(); if (!_info_stack.empty()) { ERROR_ON(_info_stack.top().elements_left < 1); _info_stack.top().elements_left--; // The container is itself an element of the previous container _info_stack.top().container_nodes.push_back(construct_node); } else { // Store the final node for access outside the visit _last_node = construct_node; } } torch::jit::Graph *_graph; std::stack _info_stack; std::vector _all_const_nodes; torch::jit::Node *_last_node; }; void handleListOrTuple(torch::jit::Graph *graph, torch::jit::Node *n, std::unordered_set *to_delete) { torch::jit::WithInsertPoint const insert_point(n); const WithNodeMetadata meta(n); // Use the visitor to turn the single list/tuple constant into many // constants and List/TupleConstructs. ListTupleVisitor visitor(graph); const auto &tuple_ivalue = n->ival(c10::attr::value); tuple_ivalue.visit(std::function( std::reference_wrapper(visitor))); // Find the very last node added and use it to replace the original node auto *replacement_node = visitor.getLastNode(); auto *replacement_node_out = replacement_node->output(); replacement_node_out->setType(n->output()->type()); n->output()->replaceAllUsesWith(replacement_node_out); // The nodes added in the visitor match those of constants not in lists/tuples // *before* canonicalisation (to permit code reuse). Hence, we canonicalise // in the same way. 
// Re-point every use of the outputs of `node_to_process` to the matching
// output of either `host_side_replacement` or `ipu_side_replacement`,
// depending on how getUseOfNode() classifies the using node.
//
// - HostSideOnly: the user reads the host side replacement instead.
// - PopARTOnly: the user reads the IPU side replacement instead.
// - HostSideAndPopART: the user is cloned twice, one clone reading the host
//   side replacement and the other the IPU side replacement. The function
//   then recurses on the user so that its own users are split the same way,
//   and the original user is added to `to_delete`.
//
// NOTE(review): the element type of `to_delete` is not visible in this
// extract; nodes are inserted into it below.
void recursivelySelectHostAndIPUSideConstants(
    torch::jit::Node *node_to_process,
    torch::jit::Node *host_side_replacement,
    torch::jit::Node *ipu_side_replacement,
    std::unordered_set *to_delete) {
  for (size_t output_idx = 0; output_idx < node_to_process->outputs().size();
       output_idx++) {
    auto *output = node_to_process->output(output_idx);

    // Every branch below takes the first use away from `output`, so always
    // look at uses()[0] until no use is left.
    while (!output->uses().empty()) {
      auto use = output->uses()[0];
      switch (getUseOfNode(use.user)) {
      case UseOfNode::HostSideOnly:
        use.user->replaceInput(use.offset,
                               host_side_replacement->output(output_idx));
        break;
      case UseOfNode::PopARTOnly:
        use.user->replaceInput(use.offset,
                               ipu_side_replacement->output(output_idx));
        break;
      case UseOfNode::HostSideAndPopART:
        auto *graph = use.user->owningGraph();

        // The clones are inserted at the position of the node they replace.
        torch::jit::WithInsertPoint const insert_point(use.user);
        const WithNodeMetadata meta(use.user);

        // Clone with the same inputs as the original: only the input at
        // use.offset is swapped afterwards.
        auto same_value = [](torch::jit::Value *value) { return value; };

        auto *host_side_node = graph->createClone(use.user, same_value);
        host_side_node->replaceInput(use.offset,
                                     host_side_replacement->output(output_idx));
        insertNodeInGraph(graph, host_side_node);

        auto *ipu_side_node = graph->createClone(use.user, same_value);
        ipu_side_node->replaceInput(use.offset,
                                    ipu_side_replacement->output(output_idx));
        insertNodeInGraph(graph, ipu_side_node);

        // The users of the original node must now pick one of the two clones.
        recursivelySelectHostAndIPUSideConstants(use.user, host_side_node,
                                                 ipu_side_node, to_delete);
        to_delete->insert(use.user);

        // Prevent further cloning: detach the original user from all of its
        // inputs (presumably this is also what removes it from the use list
        // of `output`, letting the enclosing loop make progress).
        while (!use.user->inputs().empty()) {
          use.user->removeInput(0);
        }
        break;
      }
    }
  }
}
Find any host_and_ipu_side_tensor_constant constants and perform the // necessary splitting void rectifyHostAndIPUSideConstants( torch::jit::Graph *graph, std::unordered_set *to_delete) { logging::LogContext const ctx_func("rectifyHostAndIPUSideConstants"); for (auto *node : graph->nodes()) { logging::LogContext const ctx("processing " + nodeToString(node)); if (node->kind() != symbols::poptorch::host_and_ipu_side_tensor_constant) { continue; } // Create two new nodes auto t = getNodeTensorAttrValue(node); torch::jit::WithInsertPoint const insert_point(node); const WithNodeMetadata meta(node); torch::jit::Node *host_side_node = createAndInsertNode( graph, symbols::poptorch::host_side_tensor_constant); host_side_node->output()->inferTypeFrom(t); setNodeTensorAttrValue(host_side_node, t); torch::jit::Node *ipu_node = createAndInsertNode(graph, symbols::poptorch::tensor_constant); ipu_node->output()->inferTypeFrom(t); setNodeTensorAttrValue(ipu_node, t); recursivelySelectHostAndIPUSideConstants(node, host_side_node, ipu_node, to_delete); to_delete->insert(node); } } void removeStateChangingNodesFromHostSideBranch( torch::jit::Graph *graph, std::unordered_set *to_delete) { logging::LogContext const ctx_func( "removeStateChangingNodesFromHostSideBranch"); for (auto *node : graph->nodes()) { logging::LogContext const ctx("processsing " + nodeToString(node)); if (node->kind() != symbols::poptorch::host_side_tensor_constant) { continue; } std::vector to_process; to_process.push_back(node); while (!to_process.empty()) { auto *cur_node = to_process.back(); to_process.pop_back(); auto outputs = cur_node->outputs(); for (auto *output : outputs) { for (auto use : output->uses()) { to_process.push_back(use.user); } } if (!compilerStateChangingKind(cur_node->kind())) { continue; } // The number of outputs may be less e.g. begin_ipu_block, but otherwise // any output to be replaced must match the input for this to work. 
for (size_t output_idx = 0; output_idx < cur_node->outputs().size(); output_idx++) { cur_node->output(output_idx) ->replaceAllUsesWith(cur_node->input(output_idx)); } to_delete->insert(cur_node); } } } void canonicaliseIfConstant(torch::jit::Graph *graph, torch::jit::Node *node, std::unordered_set *to_delete) { logging::LogContext const ctx("processing " + nodeToString(node)); if (node->kind() == c10::aten::size) { // This will be made a constant in the size handler node->output()->setType( c10::TensorType::create(c10::ScalarType::Int, c10::nullopt, 1, false)); } // If it's not a constant or if it doesn't have a value (i.e is None) or if // it's a Device if (node->kind() != c10::prim::Constant || !node->hasAttribute(c10::attr::value) || node->output()->type()->isSubtypeOf(c10::DeviceObjType::get())) { return; } if (node->output()->type()->isSubtypeOf(c10::NumberType::get()) || node->output()->type()->isSubtypeOf(c10::BoolType::get())) { logging::LogContext const ctx2("handling as number constant"); handleNumberConstant(graph, node); } else if (node->output()->type()->isSubtypeOf(c10::TensorType::get())) { logging::LogContext const ctx2("handling as tensor constant"); handleTensorConstant(graph, node); } else if (node->output()->type()->isSubtypeOf(c10::StringType::get())) { logging::LogContext const ctx2("handling as string constant"); handleStringConstant(graph, node); } else if (node->output()->type()->isSubtypeOf(c10::ListType::ofBools())) { // Only known case is the result of an evaluated constexpr logging::LogContext const ctx2("handling as bool list constant"); handleListOrTuple(graph, node, to_delete); } else if (node->output()->type()->isSubtypeOf(c10::ListType::ofFloats())) { // Only known case is the result of an evaluated constexpr logging::LogContext const ctx2("handling as float list constant"); handleListOrTuple(graph, node, to_delete); } else if (node->output()->type()->isSubtypeOf(c10::ListType::ofInts())) { // Only known case is the result of an 
evaluated constexpr logging::LogContext const ctx2("handling as int list constant"); handleListOrTuple(graph, node, to_delete); } else if (node->output()->type()->isSubtypeOf(c10::ListType::ofTensors())) { // Only known case is the result of an evaluated constexpr logging::LogContext const ctx2("handling a tensor list constant"); handleListOrTuple(graph, node, to_delete); } else if (node->output()->type()->isSubtypeOf(c10::ListType::create( c10::OptionalType::create(c10::TensorType::get())))) { logging::LogContext const ctx2("handling an optional tensor list constant"); handleListOrTuple(graph, node, to_delete); } else if (node->output()->type()->cast()) { handleListOrTuple(graph, node, to_delete); } else { ERROR("Unsupported type " << node->output()->type()->str()); } to_delete->insert(node); } } // namespace void canonicaliseConstants(torch::jit::Graph *graph) { logging::LogContext const ctx_func("CanonicaliseConstants"); std::unordered_set to_delete; for (auto *node : graph->nodes()) { canonicaliseIfConstant(graph, node, &to_delete); } searchAndPossiblyDestroy(to_delete); to_delete.clear(); rectifyHostAndIPUSideConstants(graph, &to_delete); searchAndPossiblyDestroy(to_delete); to_delete.clear(); removeStateChangingNodesFromHostSideBranch(graph, &to_delete); searchAndPossiblyDestroy(to_delete); } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/CastUnsupportedInputs.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include "torch/csrc/jit/ir/ir.h" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "../PoptorchSymbols.hpp" namespace poptorch { namespace type_and_constant_canonicalization { namespace { void processInputTensor(torch::jit::Graph *graph, torch::jit::Value *input) { auto tensor_type = input->type()->expect(); auto current_type = tensor_type->scalarType().value(); at::ScalarType new_type = coerceToSupportedType(current_type); if (current_type == at::ScalarType::BFloat16) { new_type = at::ScalarType::Half; } else if (new_type == current_type) { // No need for a host side cast return; } auto *earliest_user = findEarliestUser(input); if (earliest_user == nullptr) { logging::warn("Graph contains an unused input %{} : {}", input->debugName(), *tensor_type); return; } // This is an identity op but used just to make sure the implicit cast // does not end up promoting to a Double/Long auto *new_node = graph->create(symbols::poptorch::host_side_cast); insertNodeBeforeNode(new_node, earliest_user); input->replaceAllUsesWith(new_node->output()); new_node->addInput(input); new_node->output()->setType(tensor_type->withScalarType(new_type)); } } // namespace void castUnsupportedInputs(torch::jit::Graph *graph) { auto collapsed_inputs = collapsedGraphInputHierachy(graph); for (auto *input : collapsed_inputs) { if (input != nullptr) { processInputTensor(graph, input); } } } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/CheckAndChangeOutputTypes.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include #include #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" #include "../PoptorchSymbols.hpp" namespace poptorch { namespace type_and_constant_canonicalization { namespace { constexpr bool supportedType(const at::ScalarType type) { return (type == at::ScalarType::Int || type == at::ScalarType::Long || type == at::ScalarType::Half || type == at::ScalarType::Float || type == at::ScalarType::Double || type == at::ScalarType::Bool || type == at::ScalarType::BFloat16 || type == at::ScalarType::Char || type == at::ScalarType::Byte || type == at::ScalarType::Short); } bool isBeforeHostSideCast(const torch::jit::Node *n) { if (n->kind() == c10::prim::TupleUnpack || n->kind() == c10::prim::ListUnpack) { // Recurse through unpacks until we find a host_side_cast or otherwise // return false for (const torch::jit::Value *output : n->outputs()) { if (output->uses().size() != 1) { continue; } if (isBeforeHostSideCast(output->uses()[0].user)) { return true; } } } // Otherwise, the presence or lack of a host_side_cast will indicate whether // to return true or false return n->kind() == symbols::poptorch::host_side_cast; } void warnNonNativeSupport(torch::jit::Node *node, const char *unsupported_type) { // Ignore nodes for which the type is inconsequential if (node->kind() == c10::aten::argmax || node->kind() == c10::aten::argmin || node->kind() == c10::aten::contiguous || node->kind() == c10::aten::chunk || node->kind() == c10::aten::detach || node->kind() == c10::aten::expand || node->kind() == c10::aten::expand_as || node->kind() == c10::aten::flatten || node->kind() == c10::aten::ones || node->kind() == c10::aten::ones || node->kind() == c10::aten::permute || node->kind() == c10::aten::reshape || node->kind() == c10::aten::roll || node->kind() == c10::aten::select || node->kind() == c10::aten::slice || node->kind() == 
c10::aten::split || node->kind() == c10::aten::stack || node->kind() == c10::aten::squeeze || node->kind() == c10::aten::transpose || node->kind() == c10::aten::unsqueeze || node->kind() == c10::aten::upsample_nearest1d || node->kind() == c10::aten::upsample_nearest2d || node->kind() == c10::aten::upsample_nearest3d || node->kind() == c10::aten::upsample_linear1d || node->kind() == c10::aten::upsample_bilinear2d || node->kind() == c10::aten::upsample_trilinear3d || node->kind() == c10::aten::upsample_bicubic2d || node->kind() == c10::aten::view || node->kind() == c10::aten::zeros || node->kind() == c10::prim::NumToTensor) { return; } static std::unordered_set warned_types; if (warned_types.find(unsupported_type) == warned_types.end()) { logging::warn( "{}: {} is not supported natively on IPU, loss of " "range/precision may occur. We will only warn on the first instance.", nodeToString(node), unsupported_type); warned_types.insert(unsupported_type); } } void maybeReplaceOutputType(torch::jit::Node *node, torch::jit::Value *output, c10::TensorType *current_type, const at::ScalarType unsupported_dtype, const at::ScalarType replacement_dtype, const char *torch_type_str) { if (current_type->scalarType() != unsupported_dtype) { return; } // Constants will be retyped later if (node->kind() != c10::prim::Constant) { warnNonNativeSupport(node, torch_type_str); output->setType(current_type->withScalarType(replacement_dtype)); } // Ensure no casting to it if (node->kind() == c10::aten::to) { // Possible locations of dtype int depending on the aten::to arity auto num_inputs = node->inputs().size(); size_t dtype_index = 0; if (num_inputs == 5 || num_inputs == 8) { dtype_index = 1; } else if (num_inputs == 6) { dtype_index = 2; } else { // Must be another aten::to signature return; } auto int_type = node->input(dtype_index)->type()->cast(); ERROR_ON_MSG(!int_type, "Expected integer type as dtype input at index " << dtype_index << " for " << nodeToString(node)); auto replacement 
= static_cast(replacement_dtype); auto *input = node->input(dtype_index)->node(); if (node->input(dtype_index)->uses().size() == 1) { // Type constant is only used once, change its value input->i_(c10::attr::value, replacement); } else { // Create a new constant as the constant is used elsewhere auto no_inputs = [](torch::jit::Value *value) { ERROR("A constant should have no inputs"); return value; // ensures correct output type }; auto *new_node = node->owningGraph()->createClone(input, no_inputs); new_node->i_(c10::attr::value, replacement); node->replaceInput(dtype_index, new_node->output()); insertNodeBeforeNode(new_node, node); } logging::info("Replacing cast to {} with cast to {} for {}", c10::toString(unsupported_dtype), c10::toString(replacement_dtype), nodeToString(node)); } } void checkAndChangeOutputTypesForOutput(torch::jit::Node *node, torch::jit::Value *output) { auto tensor_type = output->type()->cast(); // Ignore other return types e.g. NumberTypes for constants if (!tensor_type) { return; } ERROR_ON_MSG(!tensor_type->scalarType().has_value(), "Returning an unknown tensor dtype is not supported.\n"); ERROR_ON_MSG(!supportedType(*tensor_type->scalarType()), "Returning a torch." 
<< c10::toString(*tensor_type->scalarType()) << " is not supported.\n"); maybeReplaceOutputType(node, output, tensor_type.get(), at::ScalarType::Double, at::ScalarType::Float, "torch.float64"); maybeReplaceOutputType(node, output, tensor_type.get(), at::ScalarType::Long, at::ScalarType::Int, "torch.int64"); maybeReplaceOutputType(node, output, tensor_type.get(), at::ScalarType::BFloat16, at::ScalarType::Half, "torch.bfloat16"); } } // namespace void checkAndChangeOutputTypes(torch::jit::Graph *graph) { logging::LogContext const ctx_func("CheckAndChangeOutputTypes"); for (auto *n : graph->nodes()) { // Some unpacks will happen before the host side cast, so ignore them here if (isBeforeHostSideCast(n)) { continue; } logging::LogContext const ctx("processing " + nodeToString(n)); for (auto *output : n->outputs()) { logging::LogContext const ctx_2(output->debugName()); checkAndChangeOutputTypesForOutput(n, output); } } } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/EvaluateConstexprs.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include #include #include #include #include #include #include #include #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "../PoptorchSymbols.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" namespace poptorch { namespace type_and_constant_canonicalization { namespace { size_t numNodesInGraph(const torch::jit::Graph *g) { return std::distance(g->nodes().begin(), g->nodes().end()); } size_t numValuesInGraph(const torch::jit::Graph *g) { size_t num_values = 0; for (const auto *node : g->nodes()) { num_values += node->outputs().size(); } return num_values; } const c10::Symbol exclude_node_attr = c10::Symbol::attr("exclude_node"); void markForExclusion(torch::jit::Node *node) { node->i_(exclude_node_attr, 1); } void recursivelyMarkInputsForExclusion(torch::jit::Node *node) { if (node->kind() == c10::prim::Param) { return; } if (node->hasAttribute(exclude_node_attr) && node->i(exclude_node_attr) > 0) { return; } markForExclusion(node); for (auto *input : node->inputs()) { recursivelyMarkInputsForExclusion(input->node()); } } bool isMarkedForExclusion(torch::jit::Node *node) { return node->hasAttribute(exclude_node_attr) && node->i(exclude_node_attr) > 0; } void unmarkForExclusion(torch::jit::Node *node) { node->removeAttribute(exclude_node_attr); } class ConstExprEvaluator { public: explicit ConstExprEvaluator(torch::jit::Graph *g) : _graph(g), _nodes_map(numNodesInGraph(g)), _values_map(numValuesInGraph(g)) {} // Guarantees no re-hashing: does not matter if the hash map is sparse void evaluate(); private: void markSubgraphNodesForExclusion(); void copyAllConstNodesToConstexprGraph(); void removeExclusionAttributes(); void addNodeInputArgUpcast(torch::jit::Node *new_node); void removeLoneConstants(); void evaluateConstExprGraph(torch::jit::Stack *stack); void replaceWithConstants(const torch::jit::Stack &stack); void removeUnusedNodes(); bool 
nodeIsConstExpr(const torch::jit::Node &node) const; void copyNodeToConstexprGraph(torch::jit::Node *node); static void setAmbiguousValuesToFloatOrHalf(torch::jit::Value *value); // Original graph torch::jit::Graph *_graph; // Graph containing constant expressions which can be evaluated std::shared_ptr _constexpr_graph; // Map the nodes and inputs between the two graphs // original -> constexpr std::unordered_map _nodes_map; std::unordered_map _values_map; // Keep a list of the values in the original graph to be replaced by constants std::unordered_set _ins_to_make_consts; }; void ConstExprEvaluator::evaluate() { ERROR_ON_MSG(_constexpr_graph, "ConstExprEvaluator::evaluate should only be run once"); _constexpr_graph = std::make_shared(); // Copy all nodes which can be evaluated as a constant expression into a new // graph with exception of subgraph nodes. In addition, set outputs of the // new graph where required markSubgraphNodesForExclusion(); copyAllConstNodesToConstexprGraph(); removeExclusionAttributes(); // We do not want to evaluate lone constants only to replace them with an // identical constants removeLoneConstants(); // Evaluate the constexpr graph saving the outputs to stack torch::jit::Stack stack; evaluateConstExprGraph(&stack); // Replace outputs in the original graph, with the constants calculated from // the constexpr graph replaceWithConstants(stack); // Remove nodes which are now unused, in the original graph removeUnusedNodes(); } void ConstExprEvaluator::markSubgraphNodesForExclusion() { // Keep track of subgraphs to avoid evaluating constexprs that are part of // a subgraph. int num_unclosed_subgraphs = 0; for (auto *node : _graph->nodes()) { if (node->kind() == symbols::poptorch::start_for_loop || node->kind() == symbols::poptorch::start_if_block || node->kind() == symbols::poptorch::start_else_block) { num_unclosed_subgraphs++; // All nodes that eventually end up as subgraph inputs also need // to be excluded. 
recursivelyMarkInputsForExclusion(node->input()->node()); continue; } if (node->kind() == symbols::poptorch::end_for_loop) { ERROR_ON(num_unclosed_subgraphs <= 0); num_unclosed_subgraphs--; continue; } if (node->kind() == symbols::poptorch::end_if_block) { ERROR_ON(num_unclosed_subgraphs <= 0); // if..else block stores 2 subgraphs, one for each branch. num_unclosed_subgraphs -= 2; continue; } if (num_unclosed_subgraphs > 0) { markForExclusion(node); } } ERROR_ON(num_unclosed_subgraphs != 0); } void ConstExprEvaluator::copyAllConstNodesToConstexprGraph() { logging::LogContext const ctx_func("ConstExprEvaluator"); std::vector nodes_plus_return; for (auto *node : _graph->nodes()) { nodes_plus_return.push_back(node); } nodes_plus_return.push_back(_graph->return_node()); for (auto *node : nodes_plus_return) { logging::LogContext const ctx("processing " + nodeToString(node)); if (!isMarkedForExclusion(node) && nodeIsConstExpr(*node)) { copyNodeToConstexprGraph(node); } else { for (auto *input : node->inputs()) { // Add any outputs to the const expression graph if (_values_map.count(input) == 1 && _ins_to_make_consts.count(input) == 0) { _ins_to_make_consts.emplace(input); _constexpr_graph->registerOutput(_values_map[input]); } } } } logging::trace("Constexpr graph: {}", *_constexpr_graph); } void ConstExprEvaluator::removeExclusionAttributes() { for (auto *node : _graph->nodes()) { if (isMarkedForExclusion(node)) { unmarkForExclusion(node); } } } namespace { std::optional getUpcastIndexArg(torch::jit::Node *new_node) { const auto kind = new_node->kind(); if (kind == c10::aten::scatter || kind == c10::aten::scatter_ || kind == c10::aten::scatter_add || kind == c10::aten::scatter_add_ || kind == c10::aten::scatter_reduce || kind == c10::aten::scatter_reduce_ || kind == torch_scatter::scatter_max || kind == torch_scatter::scatter_min || kind == torch_scatter::scatter_mul) { return 2; } if (kind == c10::aten::take_along_dim) { return 1; } return std::nullopt; } void 
addInputUpcast(torch::jit::Graph *graph, torch::jit::Node *new_node, std::size_t arg_index) { torch::jit::Value *input = new_node->input(arg_index); torch::jit::Node *cast = createAndInsertCastOp(graph, input, at::ScalarType::Long); new_node->replaceInputWith(input, cast->output()); } } // namespace void ConstExprEvaluator::addNodeInputArgUpcast(torch::jit::Node *new_node) { if (const auto index = getUpcastIndexArg(new_node); index) { addInputUpcast(_constexpr_graph.get(), new_node, index.value()); } } void ConstExprEvaluator::removeLoneConstants() { for (auto *node : _graph->nodes()) { if (!node->inputs().empty()) { continue; } if (node->outputs().size() != 1) { continue; } if (_nodes_map.find(node) == _nodes_map.end()) { continue; } auto *new_node = _nodes_map[node]; auto uses = new_node->output()->uses(); if (uses.size() != 1) { continue; } if (uses[0].user != _constexpr_graph->return_node()) { continue; } // The node is on its own in the consextpr graph and there is no point // replacing it with another single node _constexpr_graph->eraseOutput(uses[0].offset); new_node->destroy(); _nodes_map.erase(node); _values_map.erase(node->output()); _ins_to_make_consts.erase(node->output()); } } void ConstExprEvaluator::evaluateConstExprGraph(torch::jit::Stack *stack) { torch::jit::Code const code(_constexpr_graph, ""); torch::jit::InterpreterState state(code); state.run(*stack); ERROR_ON(_ins_to_make_consts.size() != stack->size()); } void ConstExprEvaluator::replaceWithConstants(const torch::jit::Stack &stack) { // Cache the mapping of output value to stack output index std::map constexpr_value_to_out_idx; for (size_t idx = 0; idx < _constexpr_graph->outputs().size(); idx++) { constexpr_value_to_out_idx[_constexpr_graph->outputs()[idx]] = idx; } for (auto *value : _ins_to_make_consts) { // Find the matching stack output for the input from the constexpr auto *constexpr_value = _values_map[value]; // Obtain the resolved value from the stack auto resolved_value = 
stack.at(constexpr_value_to_out_idx[constexpr_value]); if (resolved_value.isTensor()) { resolved_value = resolved_value.toTensor().contiguous(); } // Insert a constant to replace the original node and replace all uses torch::jit::WithInsertPoint const insert_point(value->node()); const WithNodeMetadata meta(value->node()); torch::jit::Value *new_const = insertConstant(_graph, resolved_value); value->replaceAllUsesWith(new_const); } } bool ConstExprEvaluator::nodeIsConstExpr(const torch::jit::Node &node) const { // If a node has no outputs, it may be a sentinel if (node.outputs().empty()) { return false; } // update_param_inplace has an output but will fail on node.hasSideEffects() if (node.kind() == symbols::poptorch::update_param_inplace) { return false; } // Random nodes or nodes with side effects cannot be constants if (isNondeterministic(node) || node.hasSideEffects()) { return false; } // Either the node has no inputs, or all inputs are outputs of nodes already // copied to the constexpres_graph for (const auto *input : node.inputs()) { if (_values_map.count(input) == 0) { return false; } } return true; } void ConstExprEvaluator::removeUnusedNodes() { // Iterate in reverse so that each node has no users for (auto node_it = _graph->nodes().rbegin(); node_it != _graph->nodes().end(); node_it++) { if (_nodes_map.count(*node_it) != 0u) { node_it.destroyCurrent(); } } } void ConstExprEvaluator::copyNodeToConstexprGraph(torch::jit::Node *node) { auto *new_node = _constexpr_graph->createClone( node, [this](torch::jit::Value *v) { return this->_values_map[v]; }, false); for (auto *input : new_node->inputs()) { auto maybe_device = input->type()->cast(); if (maybe_device) { // All code should be running on CPU here input->node()->s_(c10::attr::value, "cpu"); } } const WithNodeMetadata meta(new_node); addNodeInputArgUpcast(new_node); insertNodeInGraph(_constexpr_graph.get(), new_node); // The CPU backend in some case (e.g aten::expand) will alter a tensor's // strides, 
whereas the IPU will always keep all the tensors contiguous. // This means all reshapes can be lowered to view ops on the IPU, but // not necessarily on the CPU, so just to be safe we replace all view // ops by reshape ops in the const expr graph. if (new_node->kind() == c10::aten::view) { auto *view = new_node; new_node = view->replaceWithNewSymbol(c10::aten::reshape); view->destroy(); } _nodes_map[node] = new_node; // Map the old outputs to the new const auto *old_it = node->outputs().begin(); const auto *new_it = new_node->outputs().begin(); for (; old_it != node->outputs().end(); old_it++, new_it++) { ERROR_ON(new_it == new_node->outputs().end()); _values_map[*old_it] = *new_it; } } } // namespace void evaluateConstexprs(torch::jit::Graph *graph) { ConstExprEvaluator evaluator(graph); evaluator.evaluate(); } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch/source/type_and_constant_canonicalization/MakeConstantIntParams.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include #include #include "poptorch_logging/Error.hpp" #include "poptorch_logging/Logging.hpp" #include "poptorch/OpBuilder.hpp" #include "poptorch/TypeAndConstantCanonicalization.hpp" #include "poptorch/Utils.hpp" namespace poptorch { namespace type_and_constant_canonicalization { void makeConstantIntParams(torch::jit::Graph *graph, std::vector ¶meter_names, std::vector &traced_parameter_tensors) { logging::LogContext ctx_func("makeConstantIntParams"); // _parameters in Lower to popart is traced_parameter_tensors here. 
std::size_t num_inputs = graph->inputs().size() - traced_parameter_tensors.size(); std::vector to_delete; std::size_t index = 0; for (torch::jit::Value *value : graph->inputs()) { if (index < num_inputs) { index++; continue; } logging::LogContext ctx("processing " + parameter_names[index - num_inputs]); // _parameters in Lower to popart is traced_parameter_tensors here. auto tensor = traced_parameter_tensors[index - num_inputs]; if (value->type()->kind() == c10::TypeKind::TensorType) { auto tensor_type = value->type()->expect(); auto current_type = tensor_type->scalarType().value(); if (!c10::isFloatingType(current_type)) { // Some nodes might not be used, we skip them if so. torch::jit::Node *earliest_user = findEarliestUser(value); if (earliest_user == nullptr) { continue; } torch::jit::WithInsertPoint insert_point(earliest_user); if (current_type == at::ScalarType::Long) { tensor = tensor.to(at::ScalarType::Int); } auto *new_node = tensorToConstant(graph, tensor); logging::trace("makeConstantIntParams removing graph input %{} and " "adding constant value %{}", value->debugName(), new_node->output()->debugName()); for (size_t use_idx = 0; use_idx < value->uses().size(); use_idx++) { auto u = value->uses()[use_idx]; u.user->replaceInput(u.offset, new_node->output()); use_idx--; } ERROR_ON(!value->uses().empty()); to_delete.push_back(index); } } else { // There is no known case of a parameter or buffer being a type other than // TensorType after tracing. Log a warning to assist debugging if a case // is found. logging::warn("Non tensor parameter/buffer identified: {}", parameter_names[index - num_inputs]); } index++; } // Delete highest index first not to invalidate the later indices. 
ERROR_ON(!std::is_sorted(to_delete.begin(), to_delete.end())); for (auto it = to_delete.rbegin(); it != to_delete.rend(); ++it) { size_t del_i = *it; size_t del_i_params = del_i - num_inputs; parameter_names.erase(parameter_names.begin() + del_i_params); traced_parameter_tensors.erase(traced_parameter_tensors.begin() + del_i_params); graph->eraseInput(del_i); } } } // namespace type_and_constant_canonicalization } // namespace poptorch ================================================ FILE: poptorch_compiler/pytorch_bridge/CMakeLists.txt ================================================ file(GLOB_RECURSE poptorch_compiler_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*") add_library(poptorch_compiler SHARED IpuSession.cpp ) target_link_libraries(poptorch_compiler PRIVATE poptorch_logging ) set_property(TARGET poptorch_compiler PROPERTY CXX_STANDARD 17) set_target_properties(poptorch_compiler PROPERTIES PUBLIC_HEADER "${poptorch_compiler_public_headers}") target_include_directories(poptorch_compiler PUBLIC $ $ ) install(TARGETS poptorch_compiler LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/pytorch_bridge ) ================================================ FILE: poptorch_compiler/pytorch_bridge/IpuSession.cpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
namespace {

// Returns the size in bytes of one element of `element_type`.
// Type::NONE and Type::UNDEFINED have no size: they leave the switch and
// reach the ERROR() below.
std::size_t dataSize(Type element_type) {
  switch (element_type) {
  case Type::BOOL:
  case Type::CHAR:
  case Type::UNSIGNED_CHAR:
    return 1;
  case Type::SHORT:
  case Type::UNSIGNED_SHORT:
  case Type::HALF:
  case Type::BFLOAT16:
    return 2;
  case Type::INT:
  case Type::UNSIGNED_INT:
  case Type::FLOAT:
    return 4;
  case Type::NONE:
  case Type::UNDEFINED:
    break;
  }
  ERROR("No type");
}

// IIpuSession implementation in which every Buffer is backed by host (CPU)
// memory, so all the "copies" are plain std::copy calls between host buffers.
class StaticIpuSession : public IIpuSession {
public:
  // Creates a Buffer sized for `type`: element size multiplied by the number
  // of elements.
  Buffer allocate(const TensorType &type) override {
    auto data_size = dataSize(type.element_type) * type.getNumElements();
    return Buffer(std::make_shared>(data_size));
  }

  // Fills `ipu_dest` from `cpu_data`. The number of bytes copied is the size
  // of the destination buffer, so `cpu_data` must point to at least that many
  // readable bytes.
  void copyDataFromCpuSource(Buffer &ipu_dest, const char *cpu_data) override {
    const auto &ipu_data = ipu_dest.getCpuData();
    ERROR_ON(!ipu_data);
    std::copy(cpu_data, cpu_data + ipu_data->size(), ipu_data->data());
  }

  // Copies the whole content of `ipu_src` to `cpu_dest`, which must have room
  // for the full size of the source buffer.
  void copyDataToCpu(char *cpu_dest, Buffer &ipu_src) override {
    const auto &ipu_data = ipu_src.getCpuData();
    ERROR_ON(!ipu_data);
    std::copy(ipu_data->data(), ipu_data->data() + ipu_data->size(), cpu_dest);
  }

  // Copies `src` into `dest`; it is an error for the two sizes to differ.
  // NOTE(review): unlike the two methods above, the buffers are dereferenced
  // here without an ERROR_ON(!ptr) null check first - confirm this is
  // intentional.
  void copyDataOnDevice(Buffer &dest, const Buffer &src) override {
    const auto &dest_data = dest.getCpuData();
    const auto &src_data = src.getCpuData();
    ERROR_ON(dest_data->size() != src_data->size());
    std::copy(src_data->data(), src_data->data() + src_data->size(),
              dest_data->data());
  }
};

} // namespace

// Takes ownership of `buf` as the content of this Buffer.
Buffer::Buffer(CpuBuffer buf) noexcept : _store(std::move(buf)) {}

// Replaces the content of this Buffer with `buf`.
Buffer &Buffer::operator=(CpuBuffer buf) noexcept {
  _store = std::move(buf);
  return *this;
}

// Returns the stored CPU buffer; it is an error if the variant does not
// currently hold the expected alternative.
const CpuBuffer &Buffer::getCpuData() {
  ERROR_ON(!std::holds_alternative(_store));
  return std::get(_store);
}

// Const overload of getCpuData(), with the same error condition.
const CpuBuffer &Buffer::getCpuData() const {
  ERROR_ON(!std::holds_alternative(_store));
  return std::get(_store);
}

// Reports whether the Buffer holds data.
// NOTE(review): presumably tests for "not std::monostate", the default value
// of _store - confirm against the header.
bool Buffer::hasData() const { return !std::holds_alternative(_store); }
namespace poptorch_ir {

// Host blob of memory containing data to transfer to the IPU.
using CpuBuffer = std::shared_ptr>;

// A token representing an SSA value on our side. PyTorch records its
// tensors->TensorId and we record TensorId->mlir::Value. This stops either side
// from depending directly on each other's internal representation.
using TensorId = std::uint32_t;

// This is identical except that it is known to be valid for it to be none_id.
using OptionalTensorId = std::uint32_t;

// So we can signal that a tensor was invalid (just so that unimplemented
// functions can return something right now).
constexpr TensorId tensor_error_id = std::numeric_limits::max();

// The tensor is none (e.g. optional parameter/return) and this is not an error.
constexpr TensorId none_id = std::numeric_limits::max() - 1;

// How to calculate which floating-point outputs require gradients (other
// types will always have this set to false).
enum class RequiresGradType {
  OR_INPUTS, // OR together all the input tensor requires_grad values
  FALSE      // always false
};

// The tensors produced for a single ODS return, together with how
// requires_grad is to be derived for them.
struct ODSTensorResult {
  std::vector tensor_ids;
  std::vector requires_grad_types;
};

// When returning an MLIR op, each return could be a compulsory, optional or
// variadic tensor under the MLIR Operation Definition Specification (ODS).
// Using a vector for each return allows each return to be optional or variadic.
using ODSTensorResults = std::vector;

// Element types. NONE and UNDEFINED are placeholders rather than real data
// types.
enum class Type : std::uint8_t {
  BOOL,
  CHAR,
  UNSIGNED_CHAR,
  SHORT,
  UNSIGNED_SHORT,
  INT,
  UNSIGNED_INT,
  HALF,
  FLOAT,
  BFLOAT16,
  NONE,
  UNDEFINED,
};

// Shape and element type of a tensor.
struct TensorType {
  std::vector shape;
  Type element_type;

  // Product of all the dimensions in `shape`; an empty shape gives 1.
  std::int64_t getNumElements() const {
    return std::accumulate(shape.begin(), shape.end(), std::int64_t{1},
                           std::multiplies<>());
  }
};

// Name, host buffer and tensor type of a stream.
struct StreamInfo {
  std::vector name;
  CpuBuffer buff;
  TensorType type;

  // Non-owning view over `name`; it is only valid while `name` is alive and
  // unmodified.
  std::string_view nameStringView() const {
    return std::string_view(name.data(), name.size());
  }
};

} // namespace poptorch_ir
namespace poptorch_ir {

// The inputs and outputs of a function.
struct FunctionIO {
  std::vector inputs;
  std::vector outputs;
};

// Holder for the data of a tensor. A default-constructed Buffer holds
// std::monostate; it can otherwise be given a CpuBuffer.
class Buffer {
  // TODO(T70841): since Buffer is stored as a shared pointer it should be
  // possible to at least stop CpuBuffer being a shared pointer.
  std::variant _store = std::monostate{};

public:
  Buffer() = default;
  // Construct from / assign a CPU buffer.
  explicit Buffer(CpuBuffer buf) noexcept;
  Buffer &operator=(CpuBuffer buf) noexcept;

  // Access the stored CPU buffer.
  const CpuBuffer &getCpuData();
  const CpuBuffer &getCpuData() const;

  // Whether the buffer currently holds data.
  bool hasData() const;
};

// Interface for allocating buffers and moving data to, from and between them.
class IIpuSession {
public:
  virtual ~IIpuSession() = default;

  // Create a buffer suitable for a tensor of the given type.
  virtual Buffer allocate(const TensorType &type) = 0;

  // Copy from the host pointer `cpu_src` into `ipu_dest`.
  virtual void copyDataFromCpuSource(Buffer &ipu_dest, const char *cpu_src) = 0;
  // Copy the content of `ipu_src` to the host pointer `cpu_dest`.
  virtual void copyDataToCpu(char *cpu_dest, Buffer &ipu_src) = 0;
  // Copy the content of `src` into `dest`.
  virtual void copyDataOnDevice(Buffer &dest, const Buffer &src) = 0;
};

// Factory for a session object.
std::shared_ptr createStaticSession();

} // namespace poptorch_ir
#pragma once #include namespace poptorch { enum class ErrorCategory { RuntimeRecoverable, RuntimeUnrecoverable, Other }; /* * A subclass of this class is used to pass exception information across the ABI * boundary between popart_compiler and the pybind11 interface. It has to use * POD data types to cross the boundary successfully. We then unpack it into a * PoptorchError on the pybind11 side and rethrow it. */ class ExceptionInfo { public: virtual ~ExceptionInfo(); const virtual char *what() const = 0; const virtual char *type() const = 0; virtual int64_t stackDepth() const = 0; const virtual char *stack(int64_t level) const = 0; const virtual char *filename() const = 0; virtual uint64_t line() const = 0; const virtual char *recoveryAction() const = 0; virtual ErrorCategory category() const = 0; }; } // namespace poptorch ================================================ FILE: poptorch_err/include/poptorch_err/ExceptionHandling.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #pragma once #include #include #include #include #include "poptorch_err/ExceptionInfo.hpp" namespace poptorch { /* * The function convertToPoptorchExceptionOrRethrow() processes all the * exception types we're interested in, extracts detail, and marshals them as * instances of PoptorchErrorInfo. The exceptions we're not interested in are * re-thrown as-is. */ struct PoptorchErrorInfo { public: ErrorCategory category; std::string filename; uint64_t line; std::string type; std::string recovery_action; std::string message; std::string long_message; std::vector stack; std::string location; }; PoptorchErrorInfo convertToPoptorchExceptionOrRethrow(const std::exception_ptr &e, bool catch_all, const std::string &catch_file, uint64_t catch_line); } // namespace poptorch /* * This template wraps a function in our try..catch block. 
// Converts the exception held by `eptr` into a PoptorchErrorInfo.
//
// eptr:       the exception to convert.
// catch_all:  when false, a std::exception that is not one of the more
//             specific types handled below is rethrown instead of converted.
// catch_file, catch_line:
//             location reported when the exception does not carry its own.
//
// Exceptions that match none of the handlers below propagate to the caller.
// On return the logging context has been reset.
PoptorchErrorInfo convertToPoptorchExceptionOrRethrow(
    const std::exception_ptr &eptr, bool catch_all,
    const std::string &catch_file, uint64_t catch_line) {
  ErrorCategory category = ErrorCategory::Other;
  std::string filename;
  uint64_t line;
  std::string type;
  std::string recovery_action;
  std::string message;
  std::vector stack;
  std::string location;

  // Default location: where the exception was caught. Overwritten below when
  // the exception provides something more precise.
  filename = catch_file;
  line = catch_line;
  try {
    // Give the PopART compiler the first chance to rethrow the exception
    // (presumably as an ExceptionInfo for PopART / Poplar errors), then fall
    // back to rethrowing it unchanged so that the handlers below can
    // dispatch on its type.
    popart_compiler::rethrowPopartOrPoplarException(eptr, catch_file.c_str(),
                                                    catch_line);
    std::rethrow_exception(eptr);
  } catch (const ExceptionInfo &ei) {
    filename = ei.filename();
    line = ei.line();
    category = ei.category();
    type = ei.type();
    message = ei.what();
    for (int i = 0; i < ei.stackDepth(); i++) {
      stack.emplace_back(ei.stack(i));
    }
    recovery_action = ei.recoveryAction();
  } catch (const poptorch::logging::Error &ex) {
    logging::trace("Full error: {}", ex.what());
    // NOTE(review): this assignment is overwritten by ex.message() a few
    // lines below, so it has no effect.
    message = ex.what();
    type = "poptorch_cpp_error";
    filename = ex.file();
    line = ex.line();
    message = ex.message();
  } catch (const std::out_of_range &ex) {
    // Always converted, whatever the value of catch_all.
    message = ex.what();
    type = "std::out_of_range";
  } catch (const std::exception &ex) {
    if (!catch_all) {
      throw;
    }
    message = ex.what();
    type = "std::exception";
  }

  // Messages with too many lines are written to a log file and replaced by a
  // pointer to that file.
  // NOTE(review): despite its name, max_log_line_length is compared with the
  // number of newlines in the message, not with the length of a line.
  if (std::count(std::begin(message), std::end(message), '\n') >
      max_log_line_length) {
    std::ofstream log;
    log.open(ERROR_LOG);
    log << message;
    log.close();
    message = "See " ERROR_LOG " for details";
  }

  // Build the long, human readable message.
  std::stringstream swhat;
  swhat << "In " << filename << ":" << line << ": '" << type
        << "': " << message;
  if (category == ErrorCategory::RuntimeRecoverable) {
    swhat << "\nRecovery action required: " << recovery_action;
  }

  // Append the logging context, if there is one and it is not empty.
  auto ctx = poptorch::logging::LogContext::context();
  if (ctx) {
    location = ctx.get();
    if (!location.empty()) {
      swhat << "\nError raised in:\n" << location;
    }
  }

  PoptorchErrorInfo pe;
  pe.long_message = swhat.str();
  pe.category = category;
  pe.filename = filename;
  pe.line = line;
  pe.type = type;
  pe.recovery_action = recovery_action;
  pe.message = message;
  pe.stack = stack;
  pe.location = location;
  poptorch::logging::LogContext::resetContext();
  return pe;
}
${PROJECT_SOURCE_DIR}/../scripts/generate_poppyg_package.py sdist --output-dir ${CMAKE_INSTALL_PREFIX}/dist --python-dir ${INSTALL_POPPYG_PYDIR} ) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .) ================================================ FILE: poptorch_geometric/License.txt ================================================ The MIT License (MIT) Copyright (c) 2023 Graphcore Limited Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: poptorch_geometric/MANIFEST.in ================================================ include *.py include *.toml include License.txt ================================================ FILE: poptorch_geometric/README.md ================================================ # poptorch-geometric Set of extensions for PyTorch Geometric, enabling GNN models to be trained, evaluated and used on the Graphcore IPU. :warning: This project is under active development. 
All APIs should be considered volatile and any feedback is welcome. ================================================ FILE: poptorch_geometric/config.buildenv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. installers.add(PipRequirements("requirements.txt")) ================================================ FILE: poptorch_geometric/poptorch_geometric_third_party_licenses.txt ================================================ The PopTorch Geometric package includes code from the following third party projects: PyTorch Geometric ----------------- Copyright (c) 2023 PyG Team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
PyTorch Cluster --------------- Copyright (c) 2020 Matthias Fey Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: poptorch_geometric/pyproject.toml ================================================ [build-system] requires = [ "setuptools>=42", "wheel" ] build-backend = "setuptools.build_meta" ================================================ FILE: poptorch_geometric/python/CMakeLists.txt ================================================ include(GNUInstallDirs) file(GLOB poppyg_python_files "${CMAKE_CURRENT_SOURCE_DIR}/*.py") # __init__.py needs to be edited by set_version.py so don't copy it over. 
list(REMOVE_ITEM poppyg_python_files "${CMAKE_CURRENT_SOURCE_DIR}/__init__.py") install(CODE " execute_process( COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/set_version.py --input-file ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${CMAKE_CURRENT_BINARY_DIR}/__init__.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT) if(RETVAL AND NOT RETVAL EQUAL 0) message(FATAL_ERROR \"set_version.py FAILED: \${OUTPUT}\") endif()") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py DESTINATION "${INSTALL_POPPYG_PYDIR}") install(FILES ${poppyg_python_files} py.typed DESTINATION "${INSTALL_POPPYG_PYDIR}") install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ops DESTINATION "${INSTALL_POPPYG_PYDIR}") install(CODE " execute_process( COMMAND python3 ${PROJECT_SOURCE_DIR}/../scripts/generate_poppyg_package.py install --output-dir ${CMAKE_INSTALL_PREFIX} --python-dir ${INSTALL_POPPYG_PYDIR} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE RETVAL OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE OUTPUT) if(RETVAL AND NOT RETVAL EQUAL 0) message(FATAL_ERROR \"generate_poppyg_package.py FAILED: \${OUTPUT}\") endif()") ================================================ FILE: poptorch_geometric/python/__init__.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
@call_once
def registerOverrideManager():
    """Register the PyG op substitution manager with PopTorch, if possible.

    Looks up the ``poptorch._poplar_executor`` module. When it can be found
    and has a loader, the module is loaded and
    ``_TorchGeometricOpsSubstitutionManager`` is registered with its
    ``_OverwriteContextManager``. Nothing is done when the module or its
    loader is not available.
    """
    # ``import importlib`` on its own does not guarantee that the
    # ``importlib.util`` submodule has been loaded, so import it explicitly
    # rather than relying on some other module having done it already.
    import importlib.util  # pylint: disable=import-outside-toplevel

    poplar_executor_spec = importlib.util.find_spec(
        "poptorch._poplar_executor")
    if poplar_executor_spec is None:
        return

    loader = poplar_executor_spec.loader
    if loader is None:
        return

    # NOTE(review): ``Loader.load_module()`` is deprecated in favour of
    # ``exec_module()`` / ``importlib.import_module()``. It is kept because
    # switching could change which module object receives the registration -
    # confirm before replacing it.
    poplar_executor = loader.load_module()
    poplar_executor._OverwriteContextManager.registerSubsitutionManager(  # pylint: disable=protected-access
        _TorchGeometricOpsSubstitutionManager)
class FixedSizeClusterLoader(PyGFixedSizeClusterLoader, poptorch.DataLoader):
    r"""Data loader producing fixed-size mini-batches of clusters.

    Data objects coming from a
    :py:class:`torch_geometric.loader.ClusterData` are merged into a
    mini-batch of clusters, and node and edge features are padded so that
    the tensors have the same shapes in every batch.

    Args:
        cluster_data (ClusterData): The cluster from which to load the data.
        fixed_size_options (FixedSizeOptions): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object holding the maximum number of nodes and edges, and the
            other options required to pad the mini-batches produced by the
            data loader to a fixed size.
        batch_size (int, optional): The number of nodes per mini-batch to
            load. (default: :obj:`1`)
        over_size_strategy (OverSizeStrategy, optional): What to do when a
            sample cannot fit in the fixed-size mini-batch. By default,
            when the required number of samples cannot fit into the
            fixed-size mini-batch, nodes and edges are removed from the
            mini-batch to reach the specified fixed size. (default:
            `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to the mini-batch result. There is one mask for each
            level of padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask` - node level mask
            - :obj:`edges_mask` - edge level mask

            A mask value is :obj:`True` for a real element of the
            mini-batch and :obj:`False` for an element added as padding.
            (default: :obj:`True`)
        options (poptorch.Options, optional): The additional PopTorch
            options to be passed to :py:class:`poptorch.DataLoader`.
            (default: :obj:`None`)
        **kwargs (optional): The additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            cluster_data: ClusterData,
            fixed_size_options: FixedSizeOptions,
            batch_size: int = 1,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            add_pad_masks: Optional[bool] = True,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Set before the parent constructors run: `_create_collater` reads
        # it (presumably it is called during their initialisation).
        self.batch_size = batch_size

        # Fall back to the default IPU options when none are given.
        ipu_options = poptorch.Options() if options is None else options

        super().__init__(cluster_data=cluster_data,
                         fixed_size_options=fixed_size_options,
                         batch_size=batch_size,
                         over_size_strategy=over_size_strategy,
                         add_pad_masks=add_pad_masks,
                         options=ipu_options,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater in a `CombinedBatchingCollater`."""
        inner_collater = super()._create_collater(**collater_args)
        return CombinedBatchingCollater(collater=inner_collater,
                                        mini_batch_size=self.batch_size)
def make_exclude_keys(include_keys: Union[List[str], Tuple[str, ...]],
                      data: BaseData) -> Tuple[str, ...]:
    """Return the keys of ``data`` that are not listed in ``include_keys``."""
    available = set(data.keys)
    wanted = set(include_keys)
    return tuple(available - wanted)


def _divide_evenly_formula(amount: int, pieces: int) -> List[int]:
    """Split ``amount`` into ``pieces`` integer shares that sum to it.

    Shares differ by at most one; the larger shares come first.
    """
    base, remainder = divmod(amount, pieces)
    shares = []
    for position in range(pieces):
        shares.append(base + 1 if position < remainder else base)
    return shares


@singledispatch
def _divide_evenly(data, num_pad_graphs, num_pad_nodes, num_pad_edges):  # pylint: disable=unused-argument
    """Share the padding nodes and edges between the padding graphs.

    Dispatches on the type of ``data``; this fallback rejects unsupported
    types.
    """
    raise ValueError(f'Unsupported data type: {type(data)}')


@_divide_evenly.register(Data)
def _divide_evenly_data(_, num_pad_graphs: int, num_pad_nodes: int,
                        num_pad_edges: int) -> Tuple[List[int], List[int]]:
    """Homogeneous graphs: one node count and one edge count per graph."""
    nodes_per_graph = _divide_evenly_formula(num_pad_nodes, num_pad_graphs)
    edges_per_graph = _divide_evenly_formula(num_pad_edges, num_pad_graphs)
    return nodes_per_graph, edges_per_graph


@_divide_evenly.register(HeteroData)
def _divide_evenly_hetero_data(
        _, num_pad_graphs: int, num_pad_nodes: Dict[NodeType, int],
        num_pad_edges: Dict[EdgeType, int]
) -> Tuple[List[Dict[NodeType, int]], List[Dict[EdgeType, int]]]:
    """Heterogeneous graphs: one ``{type: count}`` dict per padding graph."""

    def spread(totals_by_type):
        # One dictionary per padding graph, filled type by type.
        per_graph = [{} for _ in range(num_pad_graphs)]
        for elem_type, total in totals_by_type.items():
            shares = _divide_evenly_formula(total, num_pad_graphs)
            for graph_pads, share in zip(per_graph, shares):
                graph_pads[elem_type] = share
        return per_graph

    return spread(num_pad_nodes), spread(num_pad_edges)


# NOTE(review): the implementations below are selected when the argument is
# a `Data` / `HeteroData` instance although it is annotated as `dict` -
# confirm against the callers.
@singledispatch
def _generate_data_to_pad(data_to_pad_dict):
    """Build a data object from ``data_to_pad_dict``.

    Dispatches on the type of the argument; this fallback rejects
    unsupported types.
    """
    raise ValueError(f'Unsupported data type: {type(data_to_pad_dict)}')


@_generate_data_to_pad.register(Data)
def _generate_data_to_pad_data(data_to_pad_dict: dict) -> Data:
    return Data.from_dict(data_to_pad_dict)


@_generate_data_to_pad.register(HeteroData)
def _generate_data_to_pad_hetero_data(data_to_pad_dict: dict) -> HeteroData:
    return HeteroData(data_to_pad_dict)
list([0]) return shape def _reset_attr(value: Any, key: str = None) -> Any: """Reset value to the default of its type. In case of torch.Tensor, it returns a tensor with one of the dims set to 0. The dim is determined based on the key. """ if isinstance(value, torch.Tensor): # NOTE: It has to be torch.zeros - creating a Tensor directly # (through torch.tensor) with 0 in shape ends up in creating a # tensor with wrong dimensions. return torch.zeros(_reset_dim(value.shape, key), dtype=value.dtype) return type(value)() def _create_preserve_mask(num_elems: int, num_elems_to_trim: int, slices: List[slice]) -> List[bool]: # Prevent deletion of all elements from a single graph. removable_nodes_mask = torch.ones(num_elems, dtype=torch.bool) for data_slice in slices: if data_slice.start < data_slice.stop: mask_slice = removable_nodes_mask[data_slice] mask_slice[torch.randint(high=len(mask_slice), size=(1, ))] = False indices = torch.arange(0, num_elems)[removable_nodes_mask] # Randomly select elements to remove. prune_indices = indices[torch.randperm( len(indices))][:num_elems_to_trim].type(torch.long) preserve_mask = torch.ones(num_elems, dtype=torch.bool) preserve_mask[prune_indices] = False return preserve_mask def data_slice_gen(num_list: List[int]) -> Generator[slice, None, None]: start = 0 end = 0 for num in num_list: end += num yield slice(start, end) start = end def create_slices_and_preserve_mask( max_num: int, num_list: List[int] ) -> Tuple[Generator[slice, None, None], List[bool]]: num_real = sum(num_list) # There is nothing to prune. if num_real < max_num: return None, None num_to_trim = num_real - max_num slices = list(data_slice_gen(num_list)) # Prepare the mask of randomly chosen to remove. 
preserve_mask = _create_preserve_mask(num_real, num_to_trim, slices) return slices, preserve_mask @singledispatch def _any_negative(value: int) -> bool: return value < 0 @_any_negative.register(dict) def _(value: dict) -> bool: return any(v < 0 for v in value.values()) @singledispatch def _any_positive(value: int) -> bool: return value > 0 @_any_positive.register(dict) def _(value: dict) -> bool: return any(v > 0 for v in value.values()) @singledispatch def _check_if_over_size(num_pad: int, num_total: int, type_str: str, oversize_error: str): if _any_negative(num_pad): raise RuntimeError( oversize_error.format(type_str=type_str, trim_fn=f"trim_{type_str}", type_value=num_total)) @_check_if_over_size.register(dict) def _(num_pad: dict, num_total: dict, type_str: str, oversize_error: str): for k, v in num_pad.items(): if v < 0: raise RuntimeError( oversize_error.format(type_str=f"{k} {type_str}", trim_fn=f"trim_{type_str}", type_value=num_total[k])) class FixedSizeCollater(Collater): r"""Collates a batch of graphs as a :py:class:`torch_geometric.data.Batch` of fixed-size tensors. Calling an instance of this class adds an additional graphs with the necessary number of nodes and edges to pad the batch so that tensors have the size corresponding to the maximum numbers of graphs, nodes and edges specified during initialisation. Calling an instance of this class can result in :py:exc:`RuntimeError` if the number of graphs (if set), nodes or edges in the batch is larger than the requested limits. Args: fixed_size_options (FixedSizeOptions, optional): A :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions` object which holds the maximum number of nodes, edges and other options required to pad the batches, produced by collater, to a fixed size. add_masks_to_batch (bool, optional): If set to :obj:`True`, masks object are attached to batch result. 
class FixedSizeCollater(Collater):
    r"""Collates a batch of graphs as a
    :py:class:`torch_geometric.data.Batch` of fixed-size tensors.

    Calling an instance of this class adds additional graphs with the
    necessary number of nodes and edges to pad the batch so that tensors
    have the size corresponding to the maximum numbers of graphs, nodes and
    edges specified during initialisation.

    Calling an instance of this class can result in
    :py:exc:`RuntimeError` if the number of graphs (if set), nodes or edges
    in the batch is larger than the requested limits.

    Args:
        fixed_size_options (FixedSizeOptions): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the batches, produced by collater, to a
            fixed size.
        add_masks_to_batch (bool, optional): If set to :obj:`True`, mask
            objects are attached to the batch result. They represent three
            levels of padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the batch are real
            (represented by :obj:`True` value) and which were added as a
            padding (represented by :obj:`False` value).
            (default: :obj:`False`)
        trim_nodes (bool, optional): If set to :obj:`True`, randomly prune
            nodes from batch to fulfill the condition of :obj:`num_nodes`.
            (default: :obj:`False`)
        trim_edges (bool, optional): If set to :obj:`True`, randomly prune
            edges from batch to fulfill the condition of :obj:`num_edges`.
            (default: :obj:`False`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): The keys to exclude
            from the graphs in the output batch. (default: :obj:`None`)
    """

    def __init__(
            self,
            fixed_size_options: FixedSizeOptions,
            add_masks_to_batch: Optional[bool] = False,
            trim_nodes: Optional[bool] = False,
            trim_edges: Optional[bool] = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
    ) -> None:
        super().__init__(follow_batch, exclude_keys)

        self.opts = fixed_size_options
        self.add_masks_to_batch = add_masks_to_batch
        self.trim_nodes = trim_nodes
        self.trim_edges = trim_edges
        # Set on the first call that receives `Data` objects carrying `y`.
        self.labels_type = None

    class LabelsType(Enum):
        # `y` holds one entry per graph.
        GRAPH_LVL = 0
        # `y` holds one entry per node.
        NODE_LVL = 1

    def __call__(self, data_list: List[BaseData]) -> Batch:
        """Collate ``data_list`` and pad the result to the fixed sizes.

        Args:
            data_list: The graphs to put in the batch. The list itself is
                not modified.

        Returns:
            The collated batch, with mask attributes attached when
            ``add_masks_to_batch`` is set.

        Raises:
            TypeError: If ``data_list`` is not a :obj:`list`.
            AssertionError: If the labels ``y`` are neither node level nor
                graph level.
            RuntimeError: If the graphs do not fit in the requested sizes.
        """
        # Validate the container before indexing into it.
        if not isinstance(data_list, list):
            raise TypeError(f'Expected list, got {type(data_list).__name__}.')

        # Padding graphs are appended below: work on a shallow copy so the
        # caller's list does not grow as a side effect.
        data_list = list(data_list)

        if not self.opts.is_hetero() and isinstance(data_list[0], HeteroData):
            self.opts.to_hetero(data_list[0].node_types,
                                data_list[0].edge_types)

        if (isinstance(data_list[0], Data) and hasattr(data_list[0], 'y')
                and data_list[0].y is not None):
            y0_equal_num_nodes = all(data.y.shape[0] == data.num_nodes
                                     for data in data_list)
            y0_equal_ones = all(data.y.shape[0] == 1 for data in data_list)

            if y0_equal_num_nodes and not y0_equal_ones:
                self.labels_type = self.LabelsType.NODE_LVL
            elif y0_equal_ones and not y0_equal_num_nodes:
                self.labels_type = self.LabelsType.GRAPH_LVL
            else:
                # Raised explicitly so the check survives `python -O`.
                raise AssertionError(
                    "Incorrect input data. The size of the shape"
                    " of labels `y` must be either the number"
                    " of nodes or the number of graphs")

        num_real_graphs = len(data_list)
        num_pad_graphs = self.opts.num_graphs - num_real_graphs

        if num_pad_graphs < 0:
            raise RuntimeError(
                "The maximum number of graphs requested doesn't allocate"
                " enough room for all the graphs in the batch plus at least"
                " one extra graph required for padding the batch to a fixed"
                " size. The number of graphs received for batching is"
                f" {num_real_graphs + 1}, including at least one padding"
                " graph, but space for only"
                f" {num_pad_graphs + num_real_graphs} graphs has been"
                " requested.")

        num_all_graphs = num_real_graphs + num_pad_graphs

        num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
            self._calc_pad_limits(data_list)

        # Pruning changes the node/edge counts, so the limits are
        # recomputed after each pruning step.
        if self.trim_nodes and _any_negative(num_pad_nodes):
            data_list = self._prune_nodes(data_list)
            num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
                self._calc_pad_limits(data_list)

        if self.trim_edges and _any_negative(num_pad_edges):
            data_list = self._prune_edges(data_list)
            num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges = \
                self._calc_pad_limits(data_list)

        oversize_error = (
            "The fixed sizes given don't allocate enough space for the"
            " number of {type_str} required to fit"
            f" {num_real_graphs} sample(s) into a batch"
            f" ({num_pad_graphs + num_real_graphs} including extra padded"
            " graph(s)). Increase the maximum number of {type_str}, currently"
            " set to {type_value}, or set `{trim_fn}` to remove any"
            " excess {type_str} to achieve the given maximum number of"
            " {type_str}.")

        _check_if_over_size(num_pad_nodes, self.opts.num_nodes, "nodes",
                            oversize_error)
        _check_if_over_size(num_pad_edges, self.opts.num_edges, "edges",
                            oversize_error)

        num_nodes_or_edges_positive = _any_positive(
            num_pad_nodes) or _any_positive(num_pad_edges)

        if num_pad_graphs == 0 and num_nodes_or_edges_positive:
            raise RuntimeError(
                f'Requested to pad a batch to {num_all_graphs} graphs but ' \
                f'collater got a list of {num_real_graphs} graphs and ' \
                'cannot create additional graphs to pad nodes and edges.')

        # Only the graphs created here are padding. Slicing the tail of
        # `data_list` instead would pick up real graphs whenever no padding
        # graph was appended (`data_list[-0:]` is the whole list).
        padded_data_list = []
        if num_pad_graphs and num_nodes_or_edges_positive:
            data = data_list[0]
            # Divide padding nodes and edges evenly between padding graphs.
            pad_nodes_by_graph, pad_edges_by_graph = _divide_evenly(
                data, num_pad_graphs, num_pad_nodes, num_pad_edges)
            data_to_pad_dict = self._create_structure_dict(data)

            for nodes, edges in zip(pad_nodes_by_graph, pad_edges_by_graph):
                padded_data = self._create_padded_data(data_list,
                                                       data_to_pad_dict,
                                                       nodes, edges)
                data_list.append(padded_data)
                padded_data_list.append(padded_data)

        batch = super().__call__(data_list)

        if self.add_masks_to_batch:
            self._add_masks(batch,
                            num_all_graphs,
                            num_real_graphs,
                            num_real_nodes=num_real_nodes,
                            num_real_edges=num_real_edges,
                            padded_data_list=padded_data_list)

        return batch

    @singledispatchmethod
    def _add_masks(self, batch, num_all_graphs, num_real_graphs, **kwargs):
        """Attach padding masks to ``batch``; dispatches on the batch type."""
        raise ValueError(f'Unsupported data type: {type(batch)}')

    @_add_masks.register(DataBatch)
    def _(self, batch: DataBatch, num_all_graphs: int, num_real_graphs: int,
          **kwargs) -> None:
        """Homogeneous batch: set the graph, node and edge masks on it.

        Reads ``num_real_nodes`` and ``num_real_edges`` from ``kwargs``.
        """
        num_real_nodes = kwargs['num_real_nodes']
        num_real_edges = kwargs['num_real_edges']

        # The first `num_real_*` entries are marked real, the rest padding.
        graphs_mask = torch.arange(num_all_graphs) < num_real_graphs
        nodes_mask = torch.arange(self.opts.num_nodes) < num_real_nodes
        edges_mask = torch.arange(self.opts.num_edges) < num_real_edges
        setattr(batch, 'graphs_mask', graphs_mask)
        setattr(batch, 'nodes_mask', nodes_mask)
        setattr(batch, 'edges_mask', edges_mask)

    @_add_masks.register(HeteroDataBatch)
    def _(self, batch: HeteroDataBatch, num_all_graphs: int,
          num_real_graphs: int, **kwargs) -> None:
        """Heterogeneous batch: set a graph mask on the batch and a node or
        edge mask on each of its stores.

        Reads ``padded_data_list`` (the padding graphs) from ``kwargs``.
        """
        padded_data_list = kwargs['padded_data_list']
        graphs_mask = torch.arange(num_all_graphs) < num_real_graphs
        setattr(batch, 'graphs_mask', graphs_mask)

        # Count, per store position, how many elements are padding.
        num_padded_nodes_list = [0] * len(batch.node_stores)
        num_padded_edges_list = [0] * len(batch.edge_stores)
        for padded_data in padded_data_list:
            for idx, node_store in enumerate(padded_data.node_stores):
                num_padded_nodes_list[idx] += node_store.num_nodes

            for idx, edge_store in enumerate(padded_data.edge_stores):
                num_padded_edges_list[idx] += edge_store.num_edges

        def set_mask(stores, num_padded_list, num_attr, mask_attr):
            for attr, num_padded in zip(stores, num_padded_list):
                num_elems = getattr(attr, num_attr)
                mask = torch.arange(num_elems) < (num_elems - num_padded)
                setattr(attr, mask_attr, mask)

        set_mask(batch.node_stores, num_padded_nodes_list, 'num_nodes',
                 'nodes_mask')
        set_mask(batch.edge_stores, num_padded_edges_list, 'num_edges',
                 'edges_mask')

    def _calc_pad_limits(
            self, data_list: List[BaseData]
    ) -> Union[Tuple[int, int, int, int],
               Tuple[Dict[NodeType, int], Dict[NodeType, int],
                     Dict[EdgeType, int], Dict[EdgeType, int]]]:
        """Return ``(real nodes, pad nodes, real edges, pad edges)``.

        The values are plain integers for ``Data`` inputs and per-type
        dictionaries for ``HeteroData`` inputs. A negative padding value
        means the batch exceeds the configured limit.
        """
        # Check if all elements in data_list are of the same type
        data_list_types = [type(d) for d in data_list]
        assert data_list_types[:-1] == data_list_types[1:]

        return self._calc_pad_limits_body(data_list[0], data_list)

    @singledispatchmethod
    def _calc_pad_limits_body(self, data, data_list):  # pylint: disable=unused-argument
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_calc_pad_limits_body.register(Data)
    def _(self, _, data_list: List[Data]) -> Tuple[int, int, int, int]:
        def calc_pad_limits_attr(data_list, attr, fixed_size):
            data_num_attr = sum(getattr(d, attr) for d in data_list)
            num_pad_attr = fixed_size - data_num_attr
            return data_num_attr, num_pad_attr

        num_real_nodes, num_pad_nodes = calc_pad_limits_attr(
            data_list, 'num_nodes', self.opts.num_nodes)
        num_real_edges, num_pad_edges = calc_pad_limits_attr(
            data_list, 'num_edges', self.opts.num_edges)

        return num_real_nodes, num_pad_nodes, num_real_edges, num_pad_edges

    @_calc_pad_limits_body.register(HeteroData)
    def _(self, _, data_list: List[HeteroData]
          ) -> Tuple[Dict[NodeType, int], Dict[NodeType, int],
                     Dict[EdgeType, int], Dict[EdgeType, int]]:
        real_nodes_nums = dict()
        pad_nodes_nums = dict()
        real_edges_nums = dict()
        pad_edges_nums = dict()

        for data_ in data_list:
            for node_type in data_.node_types:
                # Running total over the graphs seen so far.
                num_real_nodes = real_nodes_nums.get(
                    node_type, 0) + data_[node_type].x.shape[0]
                real_nodes_nums[node_type] = num_real_nodes
                if isinstance(self.opts.num_nodes, dict):
                    assert node_type in self.opts.num_nodes, (
                        f"Node type {node_type} exists in the data"
                        " but not in the fixed size options. Ensure"
                        " your fixed size options specify a `num_nodes`"
                        f" for node type {node_type}.")
                    num_pad_nodes = self.opts.num_nodes[
                        node_type] - num_real_nodes
                else:
                    num_pad_nodes = self.opts.num_nodes - num_real_nodes
                pad_nodes_nums[node_type] = num_pad_nodes
            for edge_type in data_.edge_types:
                num_real_edges = real_edges_nums.get(
                    edge_type, 0) + data_[edge_type].edge_index.shape[1]
                real_edges_nums[edge_type] = num_real_edges
                if isinstance(self.opts.num_edges, dict):
                    assert edge_type in self.opts.num_edges, (
                        f"Edge type {edge_type} exists in the data"
                        " but not in the fixed size options. Ensure"
                        " your fixed size options specify a `num_edges`"
                        f" for edge type {edge_type}.")
                    num_pad_edges = self.opts.num_edges[
                        edge_type] - num_real_edges
                else:
                    num_pad_edges = self.opts.num_edges - num_real_edges
                pad_edges_nums[edge_type] = num_pad_edges

        return real_nodes_nums, pad_nodes_nums, real_edges_nums, pad_edges_nums

    def _create_padded_data(
            self, data_list: List[BaseData],
            data_to_pad_dict: Dict[Union[NodeType, EdgeType, str], Any],
            num_nodes: int, num_edges: int) -> BaseData:
        """Create a new empty data instance (type specified based on the
        'data_list' input) padded to num_nodes and num_edges.
        """
        data = data_list[0]
        data_type = type(data)
        data_to_pad = _generate_data_to_pad.dispatch(data_type)(
            data_to_pad_dict)

        pad_op = Pad(max_num_nodes=num_nodes,
                     max_num_edges=num_edges,
                     node_pad_value=self.opts.node_pad_value,
                     edge_pad_value=self.opts.edge_pad_value,
                     exclude_keys=self.exclude_keys)
        padded_data = pad_op(data_to_pad)

        # Because Pad op does not pad graph values, this needs to be done
        # in a separate step.
        self._pad_graph_values(padded_data, data)
        return padded_data

    def _prune_edges(self, data_list):
        """Trim edges so the batch fits ``num_edges``; dispatches on the
        type of the first graph."""
        return self._prune_edges_body(data_list[0], data_list)

    @singledispatchmethod
    def _prune_edges_body(self, data, data_list):  # pylint: disable=unused-argument
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_prune_edges_body.register(Data)
    def _(self, _, data_list: List[Data]) -> List[Data]:
        edge_slices, preserve_edges_mask = create_slices_and_preserve_mask(
            self.opts.num_edges, [d.num_edges for d in data_list])

        # There is nothing to prune.
        if edge_slices is None:
            return data_list

        # Apply the preservation masks to the data_list to finally trim edges.
        return [
            data.edge_subgraph(preserve_edges_mask[slc])
            for data, slc in zip(data_list, edge_slices)
        ]

    @_prune_edges_body.register(HeteroData)
    def _(self, data: HeteroData,
          data_list: List[HeteroData]) -> List[HeteroData]:
        edge_types = data.edge_types
        preserve_edges_masks_dict = dict()
        edge_slices_dict = dict()
        for edge_type in edge_types:
            edge_slices, preserve_edges_mask = create_slices_and_preserve_mask(
                self.opts.num_edges[edge_type],
                [d[edge_type].edge_index.shape[1] for d in data_list])
            preserve_edges_masks_dict[edge_type] = preserve_edges_mask
            edge_slices_dict[edge_type] = edge_slices

        # Edge types with nothing to prune (slices is None) are left out of
        # the subgraph specification.
        return [
            data.edge_subgraph({
                edge_type: preserve_edges_masks_dict[edge_type][
                    edge_slices_dict[edge_type][idx]]
                for edge_type in edge_types
                if edge_slices_dict[edge_type] is not None
            }) for idx, data in enumerate(data_list)
        ]

    def _prune_nodes(self, data_list):
        """Trim nodes so the batch fits ``num_nodes``; dispatches on the
        type of the first graph."""
        return self._prune_nodes_body(data_list[0], data_list)

    @singledispatchmethod
    def _prune_nodes_body(self, data, data_list):  # pylint: disable=unused-argument
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_prune_nodes_body.register(Data)
    def _(self, _, data_list: List[BaseData]) -> List[BaseData]:
        num_graphs_to_trim = len(data_list)
        if self.opts.num_nodes < num_graphs_to_trim:
            raise RuntimeError(
                f'The number of nodes to trim to ({self.opts.num_nodes})'
                ' is less than the number of graphs in the batch'
                f' ({num_graphs_to_trim}), which would result in empty'
                ' graphs.')

        nodes_slices, preserve_nodes_mask = create_slices_and_preserve_mask(
            self.opts.num_nodes, [d.num_nodes for d in data_list])

        # There is nothing to prune.
        if nodes_slices is None:
            return data_list

        # Apply the preservation masks to the data_list to finally trim nodes.
        return [
            data.subgraph(preserve_nodes_mask[slc])
            for data, slc in zip(data_list, nodes_slices)
        ]

    @_prune_nodes_body.register(HeteroData)
    def _(self, data: HeteroData,
          data_list: List[HeteroData]) -> List[HeteroData]:
        node_types = data.node_types
        num_graphs_to_trim = len(data_list)
        preserve_nodes_masks_dict = dict()
        node_slices_dict = dict()
        for node_type in node_types:
            if self.opts.num_nodes[node_type] < num_graphs_to_trim:
                # Report the limit of the offending type, not the whole
                # dictionary of limits.
                raise RuntimeError(
                    'The number of nodes to trim to'
                    f' ({self.opts.num_nodes[node_type]})'
                    f' for node type {node_type} is less than the number'
                    f' of graphs in the batch ({num_graphs_to_trim}), which'
                    ' would result in empty graphs.')
            node_slices, preserve_nodes_mask = create_slices_and_preserve_mask(
                self.opts.num_nodes[node_type],
                [d[node_type].num_nodes for d in data_list])
            preserve_nodes_masks_dict[node_type] = preserve_nodes_mask
            node_slices_dict[node_type] = node_slices

        return [
            data.subgraph({
                node_type: preserve_nodes_masks_dict[node_type][
                    node_slices_dict[node_type][idx]]
                for node_type in data.node_types
                if node_slices_dict[node_type] is not None
            }) for idx, data in enumerate(data_list)
        ]

    @singledispatchmethod
    def _create_structure_dict(self, data):
        """Create a dict representing the structure of the input data.
        Dict keys correspond to the 'data' keys, its values are all
        defaulted.
        """
        raise ValueError(f'Unsupported data type: {type(data)}')

    @_create_structure_dict.register(Data)
    def _(self, data: Data) -> Dict[NodeType, Any]:
        # Node-level labels are always treated as a node attribute, whatever
        # `is_node_attr` reports for `y`.
        force_y = self.labels_type == self.LabelsType.NODE_LVL

        def check(key):
            return (force_y and key == 'y') or data.is_node_attr(
                key) or data.is_edge_attr(key)

        out = dict()
        for key, val in data.to_dict().items():
            if check(key):
                out[key] = _reset_attr(val, key)
        return out

    @_create_structure_dict.register(HeteroData)
    def _(self, data: HeteroData) -> Dict[Union[NodeType, EdgeType], Any]:
        out = dict()
        for key, attr in data._global_store.to_dict().items():  # pylint: disable=protected-access
            out[key] = _reset_attr(attr)

        for key, attr in chain(data.node_items(), data.edge_items()):
            out[key] = {
                k: torch.zeros(_reset_dim(v.shape, k),
                               dtype=data[key][k].dtype)
                for k, v in attr.to_dict().items()
                if isinstance(v, torch.Tensor)
            }
        return out

    @singledispatchmethod
    def _pad_graph_values(self, padded_data, original_data):
        """Fill the graph-level attributes of ``padded_data`` based on the
        ones found in ``original_data``."""
        raise ValueError(
            f'Unsupported pair of data types: {type(padded_data)}, '
            f'{type(original_data)}')

    @_pad_graph_values.register(Data)
    def _(self, padded_data: Data, original_data: Data) -> None:
        # Graph-level labels are always padded as a graph value. The
        # original tested GRAPH_LVL inside a branch only entered for
        # NODE_LVL, so the test could never succeed.
        force_y = self.labels_type == self.LabelsType.GRAPH_LVL

        def check(key):
            return (force_y and key == 'y') or not (
                original_data.is_node_attr(key)
                or original_data.is_edge_attr(key))

        # `exclude_keys` may be None (the constructor default).
        excluded = self.exclude_keys or ()
        for key, value in original_data():
            if key in excluded:
                continue
            if check(key):
                self._pad_graph_values_body(padded_data, original_data, key,
                                            value)

    @_pad_graph_values.register(HeteroData)
    def _(self, padded_data: HeteroData, original_data: HeteroData) -> None:
        # `exclude_keys` may be None (the constructor default).
        excluded = self.exclude_keys or ()
        for key, value in original_data._global_store.items():  # pylint: disable=protected-access
            if key in excluded:
                continue
            self._pad_graph_values_body(padded_data, original_data, key,
                                        value)

    def _pad_graph_values_body(self, padded_data: BaseData,
                               original_data: BaseData, key: Any,
                               value: Any) -> None:
        """Set one graph-level attribute on the padding graph.

        Tensors are replaced by a same-shaped tensor filled with
        ``graph_pad_value``; other values come from ``pad_graph_defaults``,
        falling back to the value of the original graph.
        """
        if not torch.is_tensor(value):
            padded_data[key] = self.opts.pad_graph_defaults.get(
                key, original_data[key])
        else:
            pad_shape = list(value.shape)
            pad_value = self.opts.graph_pad_value
            padded_data[key] = value.new_full(pad_shape, pad_value)
class CombinedBatchingCollater:
    r"""Manages the combined batch size defined as :obj:`mini_batch_size *
    device_iterations * replication_factor * gradient_accumulation`.

    This class is intended to be used in combination with the
    :class:`poptorch.DataLoader`.

    Args:
        collater (Collater): The collater transforming the list of
            :class:`torch_geometric.data.Data` objects to a
            :obj:`torch_geometric.data.Batch` object.
        mini_batch_size (int, optional): The size of mini batch. If not
            provided, the length of the list provided when calling an
            instance of this class is used. (default: :obj:`None`)
    """

    def __init__(
            self,
            collater: Collater,
            mini_batch_size: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.mini_batch_size = mini_batch_size
        self.collater = collater
        self.parser = types.PyGArgsParser()

    def __call__(self, batch: List[BaseData]) -> Batch:
        """Collate ``batch`` in consecutive chunks of ``mini_batch_size``
        items and merge the tensors of the resulting mini-batches into a
        single object shaped like the first mini-batch."""
        num_items = len(batch)
        mini_batch_size = self.mini_batch_size
        if mini_batch_size is None:
            # No explicit size: the whole input forms one mini-batch.
            mini_batch_size = num_items

        assert num_items % mini_batch_size == 0, \
            'Invalid batch size. ' \
            f'Got {num_items} graphs and ' \
            f'`mini_batch_size={mini_batch_size}`.'

        collated = []
        for batch_id in range(num_items // mini_batch_size):
            start = batch_id * mini_batch_size
            chunk = batch[start:start + mini_batch_size]
            collated.append(self.collater(chunk))

        tensors_per_batch = []
        for mini_batch in collated:
            tensors_per_batch.append(
                list(self.parser.yieldTensors(mini_batch)))

        return self.parser.reconstruct(
            collated[0], combine_batch_tensors_gen(tensors_per_batch))
def call_once(f):
    """Decorate ``f`` so that only its first invocation is executed.

    The first call forwards all arguments to ``f`` and returns its result.
    Every later call returns ``None`` without calling ``f``. The flag is
    set before ``f`` runs, so a first call that raises still counts.

    The wrapper exposes the state as the ``has_run`` attribute and keeps
    the name and docstring of ``f``.
    """
    import functools  # pylint: disable=import-outside-toplevel

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if wrapper.has_run:
            return None
        wrapper.has_run = True
        return f(*args, **kwargs)

    # Assigned after `wraps` so the flag always starts as False.
    wrapper.has_run = False
    return wrapper
class DataLoader(PyGDataLoader, poptorch.DataLoader):
    r"""A data loader which merges data objects from a
    :py:class:`torch_geometric.data.Dataset` to a mini-batch. Data objects
    can be either of type :py:class:`~torch_geometric.data.Data` or
    :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (List[str], optional): Will exclude each key in the
            list. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 1,
            shuffle: bool = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Stored before the parent constructor runs; `_create_collater`
        # reads it (presumably called during base-class construction --
        # confirm against the parent loader).
        self.batch_size = batch_size

        # Fall back to the default PopTorch options when none are given.
        loader_options = poptorch.Options() if options is None else options

        super().__init__(dataset=dataset,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         follow_batch=follow_batch,
                         exclude_keys=exclude_keys,
                         options=loader_options,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater so the combined batch is collated in
        mini-batches of ``batch_size`` items."""
        return CombinedBatchingCollater(
            collater=super()._create_collater(**collater_args),
            mini_batch_size=self.batch_size)
class FixedSizeDataLoader(PyGFixedSizeDataLoader, poptorch.DataLoader):
    r"""A data loader which merges data objects from
    :py:class:`poptorch.Dataset` into a mini-batch and pads node and edge
    features so tensors across all mini-batches have the same shapes.
    Data objects can be either of type :py:class:`~torch_geometric.data.Data`
    or :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The :class:`~torch_geometric.data.Dataset` instance
            from which to load the graph samples.
        batch_size (int, optional): The number of graph samples to load in each
            mini-batch. This should be at least :obj:`2` to allow for creating
            at least one padding graph. (default: :obj:`2`)
        shuffle (bool, optional): If :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. If not specified, this will be determined
            from the provided dataset. (default: :obj:`None`)
        fixed_size_strategy (FixedSizeStrategy, optional): The
            strategy to use to achieve fixed-size mini-batches. By default,
            each mini-batch will contain a fixed number of real graphs
            (`batch_size` - 1) plus one single graph for padding.
            (default: `poptorch_geometric.FixedSizeStrategy.PadToMax`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            By default, if the required number of samples cannot fit into the
            fixed-sized batch an error will be raised.
            (default: `poptorch_geometric.OverSizeStrategy.Error`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to the mini-batch result. They represent three levels
            of padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as padding
            (represented by :obj:`False`). (default: :obj:`True`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): Keys to exclude from the
            batch. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :py:class:`poptorch.DataLoader`.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 2,
            shuffle: bool = False,
            fixed_size_options: Optional[FixedSizeOptions] = None,
            fixed_size_strategy: FixedSizeStrategy = FixedSizeStrategy.
            PadToMax,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.Error,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Fall back to the IPU default options when none are given.
        loader_options = poptorch.Options() if options is None else options

        super().__init__(dataset=dataset,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         fixed_size_options=fixed_size_options,
                         fixed_size_strategy=fixed_size_strategy,
                         over_size_strategy=over_size_strategy,
                         add_pad_masks=add_pad_masks,
                         follow_batch=follow_batch,
                         exclude_keys=exclude_keys,
                         options=loader_options,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Wrap the parent's collater in a
        :class:`CombinedBatchingCollater`.

        With a batch sampler the mini-batch size is left unset, so each
        incoming list is collated as a whole. Otherwise it is
        ``padded_batch_size - 1``.
        """
        wrapped_collater = super()._create_collater(**collater_args)
        mini_batch_size = (None if self.batch_sampler is not None else
                           self.padded_batch_size - 1)
        return CombinedBatchingCollater(mini_batch_size=mini_batch_size,
                                        collater=wrapped_collater)
def validate_num_graphs(num_graphs):
    """Raise :py:exc:`ValueError` unless ``num_graphs`` is at least 2."""
    if num_graphs < 2:
        raise ValueError("The number of graphs in the batch must be"
                         " at least 2. This is to ensure the batch"
                         " contains at least 1 real graph and a graph"
                         " reserved for padding the batch to a fixed size.")


class FixedSizeOptions:
    r"""Class that holds the specification of how a data loader can be
    padded up to a fixed size. This includes the number of nodes and
    edges to pad a batch, produced using this specification, to a maximum
    number.

    Args:
        num_nodes (int or dict): The number of nodes after padding a batch.
            In heterogeneous graphs, this can be a dictionary denoting
            the number of nodes for specific node types.
        num_edges (int or dict, optional): The number of edges after padding
            a batch. In heterogeneous graphs, this can be a dictionary denoting
            the number of edges for specific edge types.
            (default: :obj:`num_nodes * (num_nodes - 1)`)
        num_graphs (int, optional): The total number of graphs in the padded
            batch. This should be at least :obj:`2` to allow for creating at
            least one padding graph. The default value is :obj:`2` accounting
            for a single real graph and a single padded graph in a batch.
            (default: :obj:`2`)
        node_pad_value (float, optional): The fill value to use for node
            features. (default: :obj:`0.0`)
        edge_pad_value (float, optional): The fill value to use for edge
            features. (default: :obj:`0.0`)
        graph_pad_value (float, optional): The fill value to use for graph
            features. (default: :obj:`0.0`)
        pad_graph_defaults (dict, optional): The default values that
            will be assigned to the keys of types different to
            :class:`torch.Tensor` in the newly created padding graphs.
            (default: :obj:`None`)

    Raises:
        ValueError: If ``num_graphs`` is smaller than 2.
    """

    def __init__(self,
                 num_nodes: Union[int, Dict[NodeType, int]],
                 num_edges: Optional[Union[int, Dict[EdgeType, int]]] = None,
                 num_graphs: int = 2,
                 node_pad_value: Optional[float] = None,
                 edge_pad_value: Optional[float] = None,
                 graph_pad_value: Optional[float] = None,
                 pad_graph_defaults: Optional[Dict[str, Any]] = None):
        self.num_nodes = num_nodes

        # Only `None` means "not specified": an explicit 0 (or empty dict)
        # is a valid limit and must not be replaced by the default.
        if num_edges is not None:
            self.num_edges = num_edges
        else:
            # Assume fully connected graph.
            total_num_nodes = sum(self.num_nodes.values()) if isinstance(
                num_nodes, dict) else num_nodes
            self.num_edges = total_num_nodes * (total_num_nodes - 1)

        validate_num_graphs(num_graphs)
        self.num_graphs = num_graphs

        # Lazily computed sums used by `total_num_nodes`/`total_num_edges`.
        self.total_num_nodes_hetero = None
        self.total_num_edges_hetero = None

        self.node_pad_value = 0.0 if node_pad_value is None else node_pad_value
        self.edge_pad_value = 0.0 if edge_pad_value is None else edge_pad_value
        self.graph_pad_value = (0.0 if graph_pad_value is None else
                                graph_pad_value)
        self.pad_graph_defaults = ({} if pad_graph_defaults is None else
                                   pad_graph_defaults)

    def is_hetero(self):
        """Returns whether the specified number of nodes and edges are in
        heterogeneous form, ie a number for each node and edge type."""
        return (isinstance(self.num_nodes, dict)
                and isinstance(self.num_edges, dict))

    def to_hetero(self, node_types: List[NodeType],
                  edge_types: List[EdgeType]):
        """Converts a single specified number of nodes and edges to a
        heterogeneous form, a number for each node and edge type.

        Values that are already dictionaries are left untouched. Returns
        ``self``.
        """
        if not isinstance(self.num_nodes, dict):
            self.num_nodes = {k: self.num_nodes for k in node_types}
        if not isinstance(self.num_edges, dict):
            self.num_edges = {k: self.num_edges for k in edge_types}
        # The specification may have changed: drop the cached totals.
        self.total_num_nodes_hetero = None
        self.total_num_edges_hetero = None
        return self

    @property
    def total_num_nodes(self):
        """The total number of nodes summed for all the node types."""
        # Tested on `num_nodes` alone so that a per-type node specification
        # combined with a single edge count still yields a number.
        if isinstance(self.num_nodes, dict):
            if self.total_num_nodes_hetero is None:
                self.total_num_nodes_hetero = sum(self.num_nodes.values())
            return self.total_num_nodes_hetero
        return self.num_nodes

    @property
    def total_num_edges(self):
        """The total number of edges summed for all the edge types."""
        if isinstance(self.num_edges, dict):
            if self.total_num_edges_hetero is None:
                self.total_num_edges_hetero = sum(self.num_edges.values())
            return self.total_num_edges_hetero
        return self.num_edges

    @classmethod
    def from_dataset(cls,
                     dataset: Dataset,
                     batch_size: int,
                     sample_limit: Optional[int] = None,
                     progress_bar: Optional[bool] = None):
        """Returns a `FixedSizeOptions` object which is a valid set of options
        for the given dataset, ensuring that the number of nodes and edges
        allocated are enough for the dataset given a particular batch size.

        Each limit is the per-sample maximum reported by the dataset
        summary multiplied by ``batch_size - 1``, plus one element reserved
        for padding.

        ``sample_limit`` is accepted for interface compatibility but is
        currently not used.
        """
        validate_num_graphs(batch_size)

        dataset_summary = Summary.from_dataset(dataset,
                                               progress_bar=progress_bar)

        def get_max_for_batch_size(batch_size, sample_max):
            return int(sample_max) * (batch_size - 1) + 1

        if dataset_summary.num_nodes_per_type:
            max_nodes_per_batch = {
                k: get_max_for_batch_size(batch_size, v.max)
                for k, v in dataset_summary.num_nodes_per_type.items()
            }
        else:
            max_nodes_per_batch = get_max_for_batch_size(
                batch_size, dataset_summary.num_nodes.max)
        if dataset_summary.num_edges_per_type:
            max_edges_per_batch = {
                k: get_max_for_batch_size(batch_size, v.max)
                for k, v in dataset_summary.num_edges_per_type.items()
            }
        else:
            max_edges_per_batch = get_max_for_batch_size(
                batch_size, dataset_summary.num_edges.max)

        # `cls` rather than the hard-coded class so subclasses get an
        # instance of their own type.
        return cls(
            num_nodes=max_nodes_per_batch,
            num_edges=max_edges_per_batch,
            num_graphs=batch_size,
        )

    @classmethod
    def from_loader(cls, loader: DataLoader, sample_limit: int = 1000):
        """Returns a `FixedSizeOptions` object which is a valid set of options
        for the given data loader, ensuring that the number of nodes and edges
        allocated are approximately enough for the mini-batches produced by
        this data loader. As the underlying loader is unlikely to produce an
        exhaustive combination of samples in a mini-batch, the
        `FixedSizeOptions` returned can only be an approximation of the maximum
        values required.

        Up to ``sample_limit`` mini-batches are inspected; the loader is
        iterated again when it is exhausted before the limit is reached.

        Raises:
            ValueError: If the loader yields no mini-batch.
        """
        try:
            first_batch = next(iter(loader))
        except StopIteration:
            raise ValueError(
                "Cannot derive fixed size options from an empty data"
                " loader.") from None

        is_hetero_data = isinstance(first_batch, HeteroData)
        max_num_graphs = 0
        max_num_nodes = dict() if is_hetero_data else 0
        max_num_edges = dict() if is_hetero_data else 0

        def loop_with_limit(loader, limit):
            count = 0
            while True:
                yielded = False
                for sample in loader:
                    if count >= limit:
                        return
                    count += 1
                    yielded = True
                    yield sample
                # A pass that yields nothing would otherwise loop forever.
                if not yielded:
                    return

        for data in loop_with_limit(loader, sample_limit):
            if is_hetero_data:
                for node_type in data.node_types:
                    max_num_nodes[node_type] = max(
                        max_num_nodes.get(node_type, 0),
                        data[node_type].num_nodes)
                for edge_type in data.edge_types:
                    max_num_edges[edge_type] = max(
                        max_num_edges.get(edge_type, 0),
                        data[edge_type].num_edges)
            else:
                max_num_nodes = max(max_num_nodes, data.num_nodes)
                max_num_edges = max(max_num_edges, data.num_edges)
            if hasattr(data, "num_graphs"):
                max_num_graphs = max(max_num_graphs, data.num_graphs)
            else:
                max_num_graphs = 1

        # Allocate space for padding
        max_num_graphs += 1
        if is_hetero_data:
            max_num_nodes = {k: v + 1 for k, v in max_num_nodes.items()}
            max_num_edges = {k: v + 1 for k, v in max_num_edges.items()}
        else:
            max_num_nodes += 1
            max_num_edges += 1

        return cls(
            num_nodes=max_num_nodes,
            num_edges=max_num_edges,
            num_graphs=max_num_graphs,
        )

    def __repr__(self):
        return (f"{self.__class__.__name__}("
                f"num_nodes={self.num_nodes}"
                " (At least one node reserved for padding), "
                f"num_edges={self.num_edges}"
                " (At least one edge reserved for padding), "
                f"num_graphs={self.num_graphs}"
                " (At least one graph reserved for padding), "
                f"node_pad_value={self.node_pad_value}, "
                f"edge_pad_value={self.edge_pad_value}, "
                f"graph_pad_value={self.graph_pad_value}, "
                f"pad_graph_defaults={self.pad_graph_defaults})")
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
"""
Provides an interface that reduces the coupling between the padding, which
happens in the dataloader, and the masking, which needs to happen in the model.

The idea is fairly simple: the dataloader defines the masking strategy for
nodes, edges, and graphs. The IPU GNNs consume that interface, and it is easy
to make the mask operations no-ops for compatibility with other hardware.

### Expected usage pattern

```python
import torch_geometric as pyg
from torch import nn
import poptorch
import poptorch_geometric

class IpuGNN(pyg.SomeGNN):
    def __init__(self, masker: Masker = NoMasker(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.masker = masker  # New line to support this pattern
        self.loss = nn.CrossEntropyLoss()

    def forward(self, node_mask, y, *args, **kwargs):
        '''Common poptorch usage pattern of needing to put the loss in
        the forward'''
        node_prediction = super().forward(*args, **kwargs)
        # clear interface for model code to program to
        masked_pred = self.masker.node_masker(node_prediction, node_mask)
        if self.training:
            return masked_pred, self.loss(masked_pred, y)
        return masked_pred

options = poptorch.Options()
dataloader = poptorch_geometric.create_dataloader(
    dataset=dataset,
    num_nodes=6000,
    options=options,
    fixed_size=True,
    collater_args={
        "num_edges": 12000,
    },
)

model = IpuGNN(dataloader.masker)
train_model = poptorch.TrainingModel(model, options=options, ...)

for data in dataloader:
    # Need to pass the mask as an extra argument.
    train_model(data.node_mask, data.y, ...)
```

### Expected benefit

The big benefit is that it pushes the responsibility of writing the masking
functions to the same piece of code that also implements the padding and
generates the node mask. It means consumers of a dataloader don't have to
worry about implementation details.
"""
import abc
from typing import Callable, Optional, Tuple, Union

import torch

Entries = Union[torch.Tensor, Tuple[torch.Tensor, ...]]
Mask = Optional[torch.Tensor]
Layer = Callable[[torch.Tensor], torch.Tensor]
DecoratedLayer = Callable[[torch.Tensor], torch.Tensor]


class Masker(abc.ABC):
    """
    Abstract interface that decouples a model from the way its data was
    loaded, by exposing a stable set of hooks for masking padded nodes, edges
    and graphs.

    Dataloaders that implement padding should also generate the matching
    masking functions, either by implementing this :class:`Masker` interface
    or by composing a `layer_mask` attribute onto the class.

    Compatible models can then apply those masks as intermediate steps before
    the loss or before pooling operations, so that padded entries do not take
    part in back propagation:

    ```python
    class Net(Module):
        def __init__(self, layer_mask):
            super().__init__()
            self.node_layer = pyg.GraphConv()
            self.masker = layer_mask
            self.loss = nn.CrossEntropyLoss()

        def forward(self, x, y, mask):
            x = self.node_layer(x)
            x = self.node_layer(x)
            x = self.node_layer(x)
            pred = self.masker.node_masker(x, mask=mask)
            return self.loss(pred, y)
    ```

    By implementing this interface we let the user change their dataloading
    pipeline without having to touch the code of the model.

    Note: Code in the node, edge and graph masker will be run on the IPU and
    needs to be compatible with torch.jit.trace.
    """

    @abc.abstractmethod
    def node_masker(self, node_entries: Entries, mask: Mask = None) -> Entries:
        """Masks out nodes which were added by padding/batching/clustering"""

    @abc.abstractmethod
    def edge_masker(self, edge_entries: Entries, mask: Mask = None) -> Entries:
        """Masks out edges which were added by padding/batching/clustering"""

    @abc.abstractmethod
    def graph_masker(self, graph_entries: Entries,
                     mask: Mask = None) -> Entries:
        """Masks out graphs which were added by padding/batching/clustering"""


class NoMasker(Masker):
    """A null-op masker to give when masking is unnecessary.

    Every hook hands its entries back untouched and ignores ``mask``.
    """

    def node_masker(self, node_entries: Entries, mask: Mask = None) -> Entries:
        return node_entries

    def edge_masker(self, edge_entries: Entries, mask: Mask = None) -> Entries:
        return edge_entries

    def graph_masker(self, graph_entries: Entries,
                     mask: Mask = None) -> Entries:
        return graph_entries


class LayerMasker(abc.ABC):
    """
    Abstract interface that decouples a model from the way its data was
    loaded, by exposing a stable set of decorators for layers which need to
    operate on padded data and graphs.

    Note: This is an alternative proposal to the :class:`Masker` above. It
    differs by decorating the layers instead of being called in between the
    layers. The decoration approach might help handle cases where a lot of
    masking is necessary: layers defined in the `__init__` of a `Module` are
    decorated once, removing the need for changing the forward method.

    The default implementation is sufficient for layers whose tensor inputs
    are all masked according to the same attribute (node, edge, or graph). It
    will not handle a layer which needs two tensors, one related to edges and
    one related to nodes.
    """

    def __init__(self, masker: Masker) -> None:
        super().__init__()
        self.masker = masker

    def _mask_inputs_with(self, layer: Layer,
                          hook_name: str) -> DecoratedLayer:
        """Wraps ``layer`` so its positional inputs go through a masker hook.

        The hook is looked up on ``self.masker`` every time the wrapped layer
        is called, so replacing ``self.masker`` after decoration is honoured.
        """

        def masked_layer(*args, mask=None):
            hook = getattr(self.masker, hook_name)
            masked_args = hook(args, mask=mask)
            return layer(*masked_args)

        return masked_layer

    @abc.abstractmethod
    def node_masker(self, layer: Layer) -> DecoratedLayer:
        """Decorates ``layer`` so its inputs are node-masked first."""
        return self._mask_inputs_with(layer, "node_masker")

    @abc.abstractmethod
    def edge_masker(self, layer: Layer) -> DecoratedLayer:
        """Decorates ``layer`` so its inputs are edge-masked first."""
        return self._mask_inputs_with(layer, "edge_masker")

    @abc.abstractmethod
    def graph_masker(self, layer: Layer) -> DecoratedLayer:
        """Decorates ``layer`` so its inputs are graph-masked first."""
        return self._mask_inputs_with(layer, "graph_masker")


class PreLayerMasker(LayerMasker):
    """Simplest layer masker: reuses the default decorators unchanged."""

    # pylint: disable=useless-super-delegation

    def node_masker(self, layer: Layer) -> DecoratedLayer:
        decorated = super().node_masker(layer)
        return decorated

    def edge_masker(self, layer: Layer) -> DecoratedLayer:
        decorated = super().edge_masker(layer)
        return decorated

    def graph_masker(self, layer: Layer) -> DecoratedLayer:
        decorated = super().graph_masker(layer)
        return decorated


# ================================================
# FILE: poptorch_geometric/python/neighbor_loader.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from typing import Callable, Dict, List, Optional, Tuple, Union

import torch
from torch_geometric.data import Data, FeatureStore, GraphStore, HeteroData
from torch_geometric.data.data import BaseData
from torch_geometric.loader import NeighborLoader
from torch_geometric.loader.utils import get_input_nodes
from torch_geometric.sampler import NeighborSampler
from torch_geometric.sampler.base import SubgraphType
from torch_geometric.typing import EdgeType, InputNodes, OptTensor

import poptorch
from poptorch_geometric.collate import FixedSizeCollater
from poptorch_geometric.collate import CombinedBatchingCollater
from poptorch_geometric import OverSizeStrategy
from poptorch_geometric.fixed_size_options import FixedSizeOptions


class PyGFixedSizeNeighborLoader(torch.utils.data.DataLoader):
    """Neighbor loader whose mini-batches are padded to a fixed size.

    An internal :class:`torch_geometric.loader.NeighborLoader` is used as a
    callable to sample the sub-graph for a batch of seed-node indices; the
    sampled sub-graph is then passed through a
    :class:`poptorch_geometric.collate.FixedSizeCollater`.

    The arguments are the same as those documented on
    :class:`FixedSizeNeighborLoader`, except that ``add_pad_masks`` defaults
    to :obj:`False` and ``options`` is only forwarded to the parent
    constructor when it is not :obj:`None`.
    """

    def __init__(
            self,
            data: Union[Data, HeteroData, Tuple[FeatureStore, GraphStore]],
            num_neighbors: Union[List[int], Dict[EdgeType, List[int]]],
            input_nodes: InputNodes = None,
            input_time: OptTensor = None,
            replace: bool = False,
            directed: bool = True,
            disjoint: bool = False,
            temporal_strategy: str = 'uniform',
            time_attr: Optional[str] = None,
            transform: Optional[Callable] = None,
            transform_sampler_output: Optional[Callable] = None,
            is_sorted: bool = False,
            filter_per_worker: bool = True,
            subgraph_type: SubgraphType = SubgraphType.directional,
            batch_size: int = 1,
            neighbor_sampler: Optional[NeighborSampler] = None,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            fixed_size_options: FixedSizeOptions = None,
            add_pad_masks: Optional[bool] = False,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        kwargs['batch_size'] = batch_size

        # The wrapped loader is never iterated by this class directly (other
        # than by `FixedSizeOptions.from_loader` below); it is called with a
        # batch of indices from `nativeCollate`.
        self.neighbour_loader = NeighborLoader(
            data,
            num_neighbors,
            input_nodes=input_nodes,
            input_time=input_time,
            replace=replace,
            subgraph_type=subgraph_type,
            directed=directed,
            disjoint=disjoint,
            temporal_strategy=temporal_strategy,
            time_attr=time_attr,
            transform=transform,
            transform_sampler_output=transform_sampler_output,
            is_sorted=is_sorted,
            filter_per_worker=filter_per_worker,
            neighbor_sampler=neighbor_sampler,
            **kwargs)

        self.input_type, input_nodes = get_input_nodes(data, input_nodes)

        if fixed_size_options is None:
            # No sizes given: estimate them by sampling the wrapped loader.
            fixed_size_options = FixedSizeOptions.from_loader(
                self.neighbour_loader)

        trim_nodes = over_size_strategy in (OverSizeStrategy.TrimNodes,
                                            OverSizeStrategy.TrimNodesAndEdges)
        trim_edges = over_size_strategy in (OverSizeStrategy.TrimEdges,
                                            OverSizeStrategy.TrimNodesAndEdges)
        collater_args = {
            'fixed_size_options': fixed_size_options,
            'add_masks_to_batch': add_pad_masks,
            'follow_batch': follow_batch,
            'exclude_keys': exclude_keys,
            'trim_nodes': trim_nodes,
            'trim_edges': trim_edges,
        }

        # Only forward `options` when one was supplied. When this class is
        # used on its own the parent is `torch.utils.data.DataLoader`, whose
        # constructor has no `options` parameter, so always passing
        # `options=None` raised a TypeError. `FixedSizeNeighborLoader` always
        # supplies a non-None value, so its behaviour is unchanged.
        if options is not None:
            kwargs['options'] = options

        collater = self._create_collater(**collater_args)
        super().__init__(dataset=range(input_nodes.size(0)),
                         collate_fn=collater,
                         **kwargs)

    def __collate__(self, index):
        """Samples the sub-graph for ``index`` and pads it to a fixed size."""
        out = self.nativeCollate(index)
        out = self.fixedSizeCollate(out)
        return out

    def _create_collater(self, **collater_args):
        """Builds the fixed-size collater and returns the collate callable."""
        self.fixed_size_collater = FixedSizeCollater(**collater_args)
        return self.__collate__

    def nativeCollate(self, index):
        """Returns the result of calling the wrapped loader with ``index``."""
        out = self.neighbour_loader(index)
        return out

    def fixedSizeCollate(self, data_list: BaseData):
        """Pads one sampled sub-graph with the fixed-size collater.

        Despite its name, ``data_list`` is a single data object: it is
        indexed by ``self.input_type`` and wrapped in a one-element list
        before being handed to the collater.
        """
        # Some keys are not handled correctly by FixedSizeCollater
        # so they need to be temporarily removed
        store = data_list[self.input_type] if self.input_type else data_list
        sample_batch_size = store.pop("batch_size")
        input_id = store.pop("input_id")

        out = self.fixed_size_collater([data_list])

        # Restore previously removed keys
        restored = out[self.input_type] if self.input_type else out
        restored.batch_size = sample_batch_size
        restored.input_id = input_id
        return out


class FixedSizeNeighborLoader(PyGFixedSizeNeighborLoader, poptorch.DataLoader):
    r"""A data loader which merges data objects from a
    :py:class:`torch_geometric.loader.NeighborLoader` to a mini-batch and pads
    node and edge features so tensors across all batches have constant shapes.

    Args:
        data (Any): A :class:`~torch_geometric.data.Data`,
            :class:`~torch_geometric.data.HeteroData`, or
            (:class:`~torch_geometric.data.FeatureStore`,
            :class:`~torch_geometric.data.GraphStore`) data object.
        num_neighbors (List[int] or Dict[Tuple[str, str, str], List[int]]):
            The number of neighbours to sample for each node in each
            iteration. If an entry is set to :obj:`-1`, all neighbors will be
            included. In heterogeneous graphs, it may also take a dictionary
            denoting the number of neighbours to sample for each individual
            edge type.
        input_nodes (torch.Tensor or str or Tuple[str, torch.Tensor]): The
            indices of nodes for which neighbours are sampled to create
            mini-batches.
            Needs to be either given as a :obj:`torch.LongTensor` or
            :obj:`torch.BoolTensor`.
            If set to :obj:`None`, all nodes will be considered.
            In heterogeneous graphs, this needs to be passed as a tuple that
            holds the node type and node indices. (default: :obj:`None`)
        input_time (torch.Tensor, optional): Optional values to override the
            timestamp for the input nodes given in :obj:`input_nodes`. If not
            set, it will use the timestamps in :obj:`time_attr` as default (if
            present). The :obj:`time_attr` needs to be set for this to work.
            (default: :obj:`None`)
        replace (bool, optional): If set to :obj:`True`, will sample with
            replacement. (default: :obj:`False`)
        subgraph_type (SubgraphType or str, optional): The type of the
            returned subgraph.
            If set to :obj:`"directional"`, the returned subgraph only holds
            the sampled (directed) edges which are necessary to compute
            representations for the sampled seed nodes.
            If set to :obj:`"bidirectional"`, sampled edges are converted to
            bidirectional edges.
            If set to :obj:`"induced"`, the returned subgraph contains the
            induced subgraph of all sampled nodes.
            (default: :obj:`"directional"`)
        directed (bool, optional): Forwarded unchanged to
            :obj:`NeighborLoader`. (default: :obj:`True`)
        disjoint (bool, optional): If set to :obj:`True`, each seed node will
            create its own disjoint subgraph.
            If set to :obj:`True`, mini-batch outputs will have a
            :obj:`batch` vector holding the mapping of nodes to their
            respective subgraph.
            This will get automatically set to :obj:`True` in the case of
            temporal sampling. (default: :obj:`False`)
        temporal_strategy (str, optional): The sampling strategy when using
            temporal sampling (:obj:`"uniform"`, :obj:`"last"`).
            If set to :obj:`"uniform"`, it will sample uniformly across
            neighbours that fulfill temporal constraints.
            If set to :obj:`"last"`, will sample the last `num_neighbors` that
            fulfill temporal constraints.
            (default: :obj:`"uniform"`)
        time_attr (str, optional): The name of the attribute that denotes
            timestamps for the nodes in the graph.
            If set, temporal sampling will be used so that neighbours are
            guaranteed to fulfill temporal constraints; that is, neighbours
            have an earlier or equal timestamp than the centre node.
            (default: :obj:`None`)
        transform (callable, optional): A function/transform that takes in
            a sampled mini-batch and returns a transformed version.
            (default: :obj:`None`)
        transform_sampler_output (callable, optional): A function/transform
            that takes in a :class:`torch_geometric.sampler.SamplerOutput` and
            returns a transformed version. (default: :obj:`None`)
        is_sorted (bool, optional): If set to :obj:`True`, assumes that
            :obj:`edge_index` is sorted by column.
            If :obj:`time_attr` is set, additionally requires that rows are
            sorted by time within individual neighbourhoods.
            This avoids internal re-sorting of the data and can improve
            runtime and memory efficiency. (default: :obj:`False`)
        filter_per_worker (bool, optional): This is left for argument
            compatibility with :obj:`NeighborLoader`. The passed value is
            ignored, FixedSizeNeighborLoader acts like filter_per_worker=True
        batch_size (int, optional): The number of nodes per mini-batch to
            load.
            (default: :obj:`1`)
        neighbor_sampler (NeighborSampler, optional): Forwarded unchanged to
            :obj:`NeighborLoader`. (default: :obj:`None`)
        over_size_strategy (OverSizeStrategy, optional): The
            behaviour if a sample cannot fit in the fixed-size mini-batch.
            By default, if the required number of samples cannot fit into the
            fixed-sized mini-batch, nodes and edges will be removed from the
            mini-batch to achieve the specified fixed size.
            (default: `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. If :obj:`None`, the options are derived
            from the underlying loader with
            :py:meth:`FixedSizeOptions.from_loader`. (default: :obj:`None`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to mini-batch result. They represent three levels of
            padding:

            - :obj:`graphs_mask`: graph level mask
            - :obj:`nodes_mask`: node level mask
            - :obj:`edges_mask`: edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as padding
            (represented by :obj:`False`).
            (default: :obj:`True`)
        follow_batch (list or tuple, optional): Forwarded unchanged to the
            fixed-size collater. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): The keys to exclude from the
            graphs in the output batch. (default: :obj:`None`)
        options (poptorch.Options, optional): The additional PopTorch options
            to be passed to :py:class:`poptorch.DataLoader`. If :obj:`None`,
            a default :py:class:`poptorch.Options` is created.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`, such as :obj:`shuffle`,
            :obj:`drop_last` or :obj:`num_workers`.
    """

    def __init__(
            self,
            data: Union[Data, HeteroData, Tuple[FeatureStore, GraphStore]],
            num_neighbors: Union[List[int], Dict[EdgeType, List[int]]],
            input_nodes: InputNodes = None,
            input_time: OptTensor = None,
            subgraph_type: SubgraphType = SubgraphType.directional,
            replace: bool = False,
            directed: bool = True,
            disjoint: bool = False,
            temporal_strategy: str = 'uniform',
            time_attr: Optional[str] = None,
            transform: Optional[Callable] = None,
            transform_sampler_output: Optional[Callable] = None,
            is_sorted: bool = False,
            filter_per_worker: bool = True,  # Ignored
            batch_size: int = 1,
            neighbor_sampler: Optional[NeighborSampler] = None,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            fixed_size_options: FixedSizeOptions = None,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            options: Optional[poptorch.Options] = None,
            **kwargs,
    ):
        # Needed by `_create_collater`, which the parent constructor calls
        # before the DataLoader itself has been initialised.
        self.batch_size = batch_size

        if options is None:
            # Create IPU default options
            options = poptorch.Options()

        super().__init__(
            data,
            num_neighbors,
            input_nodes=input_nodes,
            input_time=input_time,
            replace=replace,
            directed=directed,
            disjoint=disjoint,
            subgraph_type=subgraph_type,
            temporal_strategy=temporal_strategy,
            time_attr=time_attr,
            transform=transform,
            transform_sampler_output=transform_sampler_output,
            is_sorted=is_sorted,
            filter_per_worker=True,
            batch_size=batch_size,
            neighbor_sampler=neighbor_sampler,
            over_size_strategy=over_size_strategy,
            fixed_size_options=fixed_size_options,
            add_pad_masks=add_pad_masks,
            follow_batch=follow_batch,
            exclude_keys=exclude_keys,
            options=options,
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Wraps the parent's collate callable in a combined-batching one."""
        collater = super()._create_collater(**collater_args)
        return CombinedBatchingCollater(mini_batch_size=self.batch_size,
                                        collater=collater)


# ================================================
# FILE: poptorch_geometric/python/ops/__init__.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from .aggregation_base import Aggregation
from .cluster_gcn_conv import ClusterGCNConv
from .hetero_linear import HeteroLinear
from .instance_norm import InstanceNorm
from .knn import knn
from .knn_graph import knn_graph
from .knn_interpolate import knn_interpolate
from .mf_conv import MFConv
from .radius import radius, radius_graph

__all__ = [
    'Aggregation',
    'ClusterGCNConv',
    'HeteroLinear',
    'InstanceNorm',
    'knn',
    'knn_graph',
    'knn_interpolate',
    'MFConv',
    'radius',
    'radius_graph',
]

# ================================================
# FILE: poptorch_geometric/python/ops/aggregation_base.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import Optional
from torch import Tensor
import torch_geometric


class Aggregation(torch_geometric.nn.aggr.Aggregation):
    """Aggregation base class whose sorted-index assertion does nothing."""

    def assert_sorted_index(self, index: Optional[Tensor]):
        # Intentionally a no-op override.
        # NOTE(review): presumably the upstream check is not usable on the
        # IPU - confirm.
        pass


# ================================================
# FILE: poptorch_geometric/python/ops/cluster_gcn_conv.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Modified version of ClusterGCNConv that does not use dynamic shapes.
# Original pytorch_geometric version will be replaced by this code.

import torch
from torch import Tensor
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.typing import Adj, OptTensor, SparseTensor, torch_sparse
from torch_geometric.utils import (
    add_self_loops,
    degree,
    remove_self_loops,
    spmm,
)

# pylint: disable=abstract-method, arguments-differ, no-value-for-parameter


class ClusterGCNConv(MessagePassing):
    r"""The ClusterGCN graph convolutional operator from the
    `"Cluster-GCN: An Efficient Algorithm for Training Deep and Large Graph
    Convolutional Networks" <https://arxiv.org/abs/1905.07953>`_ paper

    .. math::
        \mathbf{X}^{\prime} = \left( \mathbf{\hat{A}} + \lambda \cdot
        \textrm{diag}(\mathbf{\hat{A}}) \right) \mathbf{X} \mathbf{W}_1 +
        \mathbf{X} \mathbf{W}_2

    where :math:`\mathbf{\hat{A}} = {(\mathbf{D} + \mathbf{I})}^{-1}(\mathbf{A}
    + \mathbf{I})`.

    Args:
        in_channels (int): Size of each input sample, or :obj:`-1` to derive
            the size from the first input(s) to the forward method.
        out_channels (int): Size of each output sample.
        diag_lambda (float, optional): Diagonal enhancement value
            :math:`\lambda`. (default: :obj:`0.`)
        add_self_loops (bool, optional): If set to :obj:`False`, will not add
            self-loops to the input graph. (default: :obj:`True`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.

    Shapes:
        - **input:**
          node features :math:`(|\mathcal{V}|, F_{in})`,
          edge indices :math:`(2, |\mathcal{E}|)`
        - **output:** node features :math:`(|\mathcal{V}|, F_{out})`
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 diag_lambda: float = 0.,
                 add_self_loops: bool = True,
                 bias: bool = True,
                 **kwargs):
        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.diag_lambda = diag_lambda
        self.add_self_loops = add_self_loops
        # `lin_out` transforms the aggregated neighbourhood, `lin_root` the
        # node's own features; only the former carries the optional bias.
        self.lin_out = Linear(in_channels,
                              out_channels,
                              bias=bias,
                              weight_initializer='glorot')
        self.lin_root = Linear(in_channels,
                               out_channels,
                               bias=False,
                               weight_initializer='glorot')

        self.reset_parameters()

    def reset_parameters(self):
        """Resets the parameters of the base class and both linear layers."""
        super().reset_parameters()
        self.lin_out.reset_parameters()
        self.lin_root.reset_parameters()

    def _diag_enhanced_weight(self, deg_inv: Tensor, row: Tensor,
                              col: Tensor) -> Tensor:
        """Returns per-edge weights with the diagonal enhancement applied.

        Every edge receives ``deg_inv[col]``; edges with ``row == col``
        additionally receive ``diag_lambda * deg_inv[row]``. The self-loop
        selection is expressed as a multiplication by a 0/1 tensor rather
        than boolean indexing, so all shapes stay static.
        """
        base_weight = deg_inv[col]
        is_self_loop = torch.eq(row, col)
        per_edge_boost = torch.index_select(self.diag_lambda * deg_inv, 0,
                                            row)
        boost = torch.mul(is_self_loop.float(), per_edge_boost)
        return torch.add(base_weight, boost)

    def forward(self, x: Tensor, edge_index: Adj) -> Tensor:
        """Runs the convolution on ``x`` over the graph in ``edge_index``.

        ``edge_index`` may be a dense ``(2, E)`` tensor or a
        :class:`SparseTensor`; any other type is propagated without edge
        weights.
        """
        edge_weight: OptTensor = None

        if isinstance(edge_index, Tensor):
            num_nodes = x.size(self.node_dim)
            if self.add_self_loops:
                # Drop existing self-loops first so that each node ends up
                # with exactly one.
                edge_index, _ = remove_self_loops(edge_index)
                edge_index, _ = add_self_loops(edge_index,
                                               num_nodes=num_nodes)

            row, col = edge_index[0], edge_index[1]
            deg_inv = 1. / degree(col, num_nodes=num_nodes).clamp_(1.)
            edge_weight = self._diag_enhanced_weight(deg_inv, row, col)

        elif isinstance(edge_index, SparseTensor):
            if self.add_self_loops:
                edge_index = torch_sparse.set_diag(edge_index)

            col, row, _ = edge_index.coo()  # Transposed.
            deg_inv = 1. / torch_sparse.sum(edge_index, dim=1).clamp_(1.)
            edge_weight = self._diag_enhanced_weight(deg_inv, row, col)
            edge_index = edge_index.set_value(edge_weight, layout='coo')

        # propagate_type: (x: Tensor, edge_weight: OptTensor)
        aggregated = self.propagate(edge_index,
                                    x=x,
                                    edge_weight=edge_weight,
                                    size=None)
        return self.lin_out(aggregated) + self.lin_root(x)

    def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor:
        """Scales each source-node feature row by its edge weight."""
        return edge_weight.view(-1, 1) * x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fused message/aggregate step: a sparse-dense matmul."""
        return spmm(adj_t, x, reduce=self.aggr)

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.in_channels}, '
                f'{self.out_channels}, diag_lambda={self.diag_lambda})')


# ================================================
# FILE: poptorch_geometric/python/ops/hetero_linear.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import torch
import torch_geometric
from torch import Tensor


class HeteroLinear(torch_geometric.nn.dense.linear.HeteroLinear):
    r"""Applies separate linear transformations to the incoming data according
    to types

    .. math::
        \mathbf{x}^{\prime}_{\kappa} = \mathbf{x}_{\kappa}
        \mathbf{W}^{\top}_{\kappa} + \mathbf{b}_{\kappa}

    for type :math:`\kappa`.
    It supports lazy initialization and customizable weight and bias
    initialization.

    Args:
        in_channels (int): Size of each input sample. Will be initialized
            lazily in case it is given as :obj:`-1`.
        out_channels (int): Size of each output sample.
        num_types (int): The number of types.
        is_sorted (bool, optional): If set to :obj:`True`, assumes that
            :obj:`type_vec` is sorted. This avoids internal re-sorting of the
            data and can improve runtime and memory efficiency.
            (default: :obj:`False`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.Linear`.

    Shapes:
        - **input:**
          features :math:`(*, F_{in})`,
          type vector :math:`(*)`
        - **output:** features :math:`(*, F_{out})`
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x: Tensor, type_vec: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): The input features.
            type_vec (torch.Tensor): A vector that maps each entry to a type.

        Returns:
            torch.Tensor: One row per input row, transformed with the weight
            (and bias, if present) of that row's type. Before the bias is
            added, rows whose type matches none of
            :obj:`range(self.num_types)` are zero.
        """
        # Start from zeros rather than uninitialised memory: rows whose type
        # id is never matched in the loop below would otherwise expose
        # arbitrary values.
        out = x.new_zeros(x.size(0), self.out_channels)

        # Static-shape formulation: rather than gathering the rows of each
        # type, rows of other types are zeroed, the full matrix goes through
        # the linear map, and only the matching rows are kept.
        for i in range(self.num_types):
            mask = torch.eq(type_vec, i).view(-1, 1)
            x_type_i = torch.where(mask, x, 0.0)
            out_type_i = torch.nn.functional.linear(x_type_i,
                                                    self.weight[i].T)
            out = torch.where(mask, out_type_i, out)

        if self.bias is not None:
            out += self.bias[type_vec]
        return out


# ================================================
# FILE: poptorch_geometric/python/ops/instance_norm.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Geometric which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2023 PyG Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from typing import Optional

import torch.nn.functional as F
from torch import Tensor
from torch.nn.modules.instancenorm import _InstanceNorm
from torch_geometric.typing import OptTensor
from torch_geometric.utils import degree, scatter

# pylint: disable=abstract-method, arguments-differ, useless-super-delegation


class InstanceNorm(_InstanceNorm):
    r"""Applies instance normalization over each individual example in a batch
    of node features as described in the `"Instance Normalization: The Missing
    Ingredient for Fast Stylization" <https://arxiv.org/abs/1607.08022>`_
    paper

    .. math::
        \mathbf{x}^{\prime}_i = \frac{\mathbf{x} -
        \textrm{E}[\mathbf{x}]}{\sqrt{\textrm{Var}[\mathbf{x}] + \epsilon}}
        \odot \gamma + \beta

    The mean and standard-deviation are calculated per-dimension separately for
    each object in a mini-batch.

    Args:
        in_channels (int): Size of each input sample.
        eps (float, optional): A value added to the denominator for numerical
            stability. (default: :obj:`1e-5`)
        momentum (float, optional): The value used for the running mean and
            running variance computation. (default: :obj:`0.1`)
        affine (bool, optional): If set to :obj:`True`, this module has
            learnable affine parameters :math:`\gamma` and :math:`\beta`.
            (default: :obj:`False`)
        track_running_stats (bool, optional): If set to :obj:`True`, this
            module tracks the running mean and variance, and when set to
            :obj:`False`, this module does not track such statistics and always
            uses instance statistics in both training and eval modes.
            (default: :obj:`False`)
    """

    def __init__(
            self,
            in_channels: int,
            eps: float = 1e-5,
            momentum: float = 0.1,
            affine: bool = False,
            track_running_stats: bool = False,
    ):
        super().__init__(in_channels, eps, momentum, affine,
                         track_running_stats)

    def reset_parameters(self):
        r"""Resets all learnable parameters of the module."""
        super().reset_parameters()

    def forward(self,
                x: Tensor,
                batch: OptTensor = None,
                batch_size: Optional[int] = None) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): The source tensor.
            batch (torch.Tensor, optional): The batch vector
                :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
                each element to a specific example. (default: :obj:`None`)
            batch_size (int, optional): The number of examples :math:`B`.
                Automatically calculated if not given. (default: :obj:`None`)
        """
        if batch is None:
            # No batch vector: hand the transposed input, with a leading
            # dimension of one added, to the functional instance norm and
            # undo both reshapes on the result.
            out = F.instance_norm(
                x.t().unsqueeze(0), self.running_mean, self.running_var,
                self.weight, self.bias, self.training
                or not self.track_running_stats, self.momentum, self.eps)
            return out.squeeze(0).t()

        if batch_size is None:
            # Derived from the data; pass `batch_size` explicitly to skip
            # this host-side reduction.
            batch_size = int(batch.max()) + 1

        mean = var = unbiased_var = x  # Dummies.

        if self.training or not self.track_running_stats:
            # Per-example element counts, clamped to avoid division by zero
            # for examples with no elements.
            norm = degree(batch, batch_size, dtype=x.dtype).clamp_(min=1)
            norm = norm.view(-1, 1)
            unbiased_norm = (norm - 1).clamp_(min=1)

            mean = scatter(x, batch, dim=0, dim_size=batch_size,
                           reduce='sum') / norm

            # `x` is centred here; the variance below and the final division
            # both rely on the centred values.
            x = x - mean.index_select(0, batch)

            var = scatter(x * x,
                          batch,
                          dim=0,
                          dim_size=batch_size,
                          reduce='sum')
            unbiased_var = var / unbiased_norm
            var = var / norm

            # Update the running statistics in place, when they exist, using
            # the statistics averaged over the examples.
            momentum = self.momentum
            if self.running_mean is not None:
                tmp = (1 -
                       momentum) * self.running_mean + momentum * mean.mean(0)
                self.running_mean.copy_(tmp)
            if self.running_var is not None:
                tmp = (1 - momentum
                       ) * self.running_var + momentum * unbiased_var.mean(0)
                self.running_var.copy_(tmp)
        else:
            # Evaluation with tracked statistics: broadcast the running
            # statistics to one row per example.
            if self.running_mean is not None:
                mean = self.running_mean.view(1, -1).expand(batch_size, -1)
            if self.running_var is not None:
                var = self.running_var.view(1, -1).expand(batch_size, -1)

            x = x - mean.index_select(0, batch)

        out = x / (var + self.eps).sqrt().index_select(0, batch)

        # The affine transform is applied only when both parameters exist.
        if self.weight is not None and self.bias is not None:
            out = out * self.weight.view(1, -1) + self.bias.view(1, -1)

        return out

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.num_features})'


# ================================================
# FILE: poptorch_geometric/python/ops/knn.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#
# This file includes content from PyTorch Cluster which
# has been modified by Graphcore Ltd.
#
# Copyright (c) 2020 Matthias Fey
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from typing import Optional

import torch


def knn(x: torch.Tensor,
        y: torch.Tensor,
        k: int,
        batch_x: Optional[torch.Tensor] = None,
        batch_y: Optional[torch.Tensor] = None,
        *args,
        **kwargs):
    # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""Finds for each element in `y` the `k` nearest points in `x`.

    The result always has the static shape :obj:`(2, y.size(0) * k)`. Slots
    for which the candidate neighbour belongs to a different example than the
    query point are filled with :obj:`-1` in both rows.

    Args:
        x (torch.Tensor): Node feature matrix
        y (torch.Tensor): Node feature matrix
        k (int): The number of neighbors.
        batch_x (torch.Tensor, optional): Batch vector which assigns each
            node to a specific example. (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector which assigns each
            node to a specific example. (default: :obj:`None`)

    :rtype: :class:`LongTensor`

    .. testsetup::

        import torch
        from torch_cluster import knn

    .. testcode::

        >>> x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        >>> batch_x = torch.tensor([0, 0, 0, 0])
        >>> y = torch.Tensor([[-1, 0], [1, 0]])
        >>> batch_y = torch.tensor([0, 0])
        >>> assign_index = knn(x, y, 2, batch_x, batch_y)
    """
    if batch_x is None:
        batch_x = x.new_zeros(x.size(0), dtype=torch.int32)

    if batch_y is None:
        batch_y = y.new_zeros(y.size(0), dtype=torch.int32)

    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y

    assert x.dim() == 2 and batch_x.dim() == 1
    assert y.dim() == 2 and batch_y.dim() == 1
    assert x.size(1) == y.size(1)
    assert x.size(0) == batch_x.size(0)
    assert y.size(0) == batch_y.size(0)

    num_features = x.size(1)

    # Rescale x and y so that every coordinate lies in [0, 1].
    min_xy = torch.min(torch.min(x), torch.min(y))
    x, y = x - min_xy, y - min_xy

    max_xy = torch.max(torch.max(x), torch.max(y))
    # When all points coincide the range is zero; dividing by it would turn
    # every coordinate (and hence every distance) into NaN, which also
    # defeats the cross-example filter below. The shifted coordinates are all
    # zero in that case, so dividing by one leaves them unchanged.
    max_xy = torch.where(max_xy > 0, max_xy, torch.ones_like(max_xy))
    x, y, = x / max_xy, y / max_xy

    # Concat batch/features to ensure no cross-links between examples exist.
    # Points of different examples end up at least `2 * num_features` apart.
    x = torch.cat([
        x,
        2 * num_features * batch_x.view(
            -1, 1).to(torch.int32 if x.dtype == torch.long else x.dtype)
    ],
                  dim=-1)
    y = torch.cat([
        y,
        2 * num_features * batch_y.view(
            -1, 1).to(torch.int32 if y.dtype == torch.long else y.dtype)
    ],
                  dim=-1)

    x_expanded = x.expand(y.size(0), *x.shape)
    y_expanded = y.reshape(y.size(0), 1, y.size(1))

    dist, col = torch.topk(torch.norm(x_expanded - y_expanded, dim=-1),
                           k=k,
                           dim=-1,
                           largest=False,
                           sorted=True)
    row = torch.arange(col.size(0), dtype=torch.long).view(-1, 1).repeat(1, k)

    # Within an example the distance is at most sqrt(num_features); across
    # examples it is at least 2 * num_features. The threshold sits strictly
    # between the two so that an exact tie (possible with one feature, where
    # the previous bound `num_features + 1` equals `2 * num_features`) can no
    # longer let a cross-example neighbour through.
    distance_upper_bound = 1.5 * num_features

    row = torch.where(dist > distance_upper_bound, -1, row).view(-1)
    col = torch.where(dist > distance_upper_bound, -1, col).view(-1)

    return torch.stack([row, col], dim=0)


# ================================================
# FILE: poptorch_geometric/python/ops/knn_graph.py
# ================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
# This file includes content from PyTorch Cluster which
# has been modified by Graphcore Ltd.
def knn_graph(x: torch.Tensor,
              k: int,
              batch: OptTensor = None,
              loop: bool = False,
              flow: str = 'source_to_target',
              cosine: bool = False,
              num_workers: int = 1) -> torch.Tensor:
    r"""Computes graph edges to the nearest :obj:`k` points.

    .. code-block:: python

        import torch
        from torch_geometric.nn import knn_graph

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        edge_index = knn_graph(x, k=2, batch=batch, loop=False)

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        k (int): The number of neighbors.
        batch (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
            each node to a specific example. (default: :obj:`None`)
        loop (bool, optional): If :obj:`True`, the graph will contain
            self-loops. Otherwise one extra neighbor is requested and every
            edge whose two endpoints are equal is removed afterwards.
            (default: :obj:`False`)
        flow (str, optional): The flow direction when using in combination
            with message passing (:obj:`"source_to_target"` or
            :obj:`"target_to_source"`).
            (default: :obj:`"source_to_target"`)
        cosine (bool, optional): Forwarded positionally to :func:`knn`.
            (default: :obj:`False`)
        num_workers (int, optional): Forwarded positionally to :func:`knn`.
            (default: :obj:`1`)

    :rtype: :class:`torch.Tensor`
    """
    assert flow in ['source_to_target', 'target_to_source']

    # Ask for one more neighbor when self-loops are going to be stripped.
    num_neighbors = k if loop else k + 1
    edge_index = knn(x, x, num_neighbors, batch, batch, cosine, num_workers)
    query_idx, neighbor_idx = edge_index[0], edge_index[1]

    if flow == 'source_to_target':
        row, col = neighbor_idx, query_idx
    else:
        row, col = query_idx, neighbor_idx

    if not loop:
        keep = row != col
        row, col = row[keep], col[keep]

    return torch.stack([row, col], dim=0)
def knn_interpolate(x: torch.Tensor,
                    pos_x: torch.Tensor,
                    pos_y: torch.Tensor,
                    batch_x: OptTensor = None,
                    batch_y: OptTensor = None,
                    k: int = 3,
                    num_workers: int = 1,
                    *args,
                    **kwargs):  # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""The k-NN interpolation from the `"PointNet++: Deep Hierarchical
    Feature Learning on Point Sets in a Metric Space"` paper.

    For each point :math:`y` with position :math:`\mathbf{p}(y)`, its
    interpolated features :math:`\mathbf{f}(y)` are given by

    .. math::
        \mathbf{f}(y) = \frac{\sum_{i=1}^k w(x_i) \mathbf{f}(x_i)}{\sum_{i=1}^k
        w(x_i)} \textrm{, where } w(x_i) = \frac{1}{d(\mathbf{p}(y),
        \mathbf{p}(x_i))^2}

    and :math:`\{ x_1, \ldots, x_k \}` denoting the :math:`k` nearest points
    to :math:`y`.

    The neighbor search returns a fixed ``k`` slots per query point and
    marks missing neighbors with ``-1``. Those slots get a weight of zero,
    so they influence neither the weighted sum nor its normaliser. A query
    point without any valid neighbor yields zeros.

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        pos_x (torch.Tensor): Node position matrix
            :math:`\in \mathbb{R}^{N \times d}`.
        pos_y (torch.Tensor): Upsampled node position matrix
            :math:`\in \mathbb{R}^{M \times d}`.
        batch_x (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b_x} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
            each node from :math:`\mathbf{X}` to a specific example.
            (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector
            :math:`\mathbf{b_y} \in {\{ 0, \ldots, B-1\}}^N`, which assigns
            each node from :math:`\mathbf{Y}` to a specific example.
            (default: :obj:`None`)
        k (int, optional): Number of neighbors. (default: :obj:`3`)
        num_workers (int, optional): Forwarded to :func:`knn` as a keyword
            argument. (default: :obj:`1`)
        *args, **kwargs: Accepted and ignored.

    Returns:
        torch.Tensor: Interpolated features with ``pos_y.size(0)`` rows.
    """
    # Neighbor search and weights depend on positions only and are computed
    # without gradient tracking.
    with torch.no_grad():
        assign_index = knn(pos_x,
                           pos_y,
                           k,
                           batch_x=batch_x,
                           batch_y=batch_y,
                           num_workers=num_workers)
        y_idx, x_idx = assign_index[0], assign_index[1]

        # Redirect "-1" slots to an extra all-zero row appended below, so
        # that index_select always receives valid indices.
        extended_y_idx = torch.where(y_idx == -1, pos_y.size(0), y_idx)
        extended_x_idx = torch.where(x_idx == -1, pos_x.size(0), x_idx)

        posx_zeros = torch.zeros_like(pos_x[:1], dtype=pos_x.dtype)
        extended_diff_x = torch.cat((pos_x, posx_zeros))
        posy_zeros = torch.zeros_like(pos_y[:1], dtype=pos_y.dtype)
        extended_diff_y = torch.cat((pos_y, posy_zeros))

        diff = torch.index_select(extended_diff_x, 0,
                                  extended_x_idx) - torch.index_select(
                                      extended_diff_y, 0, extended_y_idx)
        squared_distance = (diff * diff).sum(dim=-1, keepdim=True)
        weights = 1.0 / torch.clamp(squared_distance, min=1e-16)

        # A padded slot compares the two appended zero rows, i.e. distance 0
        # and weight 1e16, which would swamp the normaliser. Give padded
        # slots no weight at all.
        is_real = (x_idx != -1).view(-1, 1)
        weights = torch.where(is_real, weights, torch.zeros_like(weights))

    # The features are padded outside of no_grad so that gradients can flow
    # back into `x`.
    x_zeros = torch.zeros_like(x[:1], dtype=x.dtype)
    extended_x = torch.cat((x, x_zeros))

    # Slot j of the flattened neighbor list belongs to query point j // k.
    scatter_y_idx = torch.arange(pos_y.size(0),
                                 dtype=torch.long).view(-1,
                                                        1).repeat(1,
                                                                  k).view(-1)

    y = scatter(torch.index_select(extended_x, 0, extended_x_idx) * weights,
                scatter_y_idx,
                0,
                pos_y.size(0),
                reduce='sum')
    norm = scatter(weights, scatter_y_idx, 0, pos_y.size(0), reduce='sum')
    # Query points without a single valid neighbor have norm == 0; divide by
    # one instead so they stay at zero rather than becoming NaN.
    norm = torch.where(norm > 0, norm, torch.ones_like(norm))

    return y / norm
# pylint: disable=abstract-method
class MFConv(torch_geometric.nn.conv.MFConv):
    r"""The graph neural network operator from the `"Convolutional Networks
    on Graphs for Learning Molecular Fingerprints"` paper

    .. math::
        \mathbf{x}^{\prime}_i = \mathbf{W}^{(\deg(i))}_1 \mathbf{x}_i +
        \mathbf{W}^{(\deg(i))}_2 \sum_{j \in \mathcal{N}(i)} \mathbf{x}_j

    which trains a distinct weight matrix for each possible vertex degree.

    This variant evaluates every per-degree linear layer on the full node
    set and combines the results with :func:`torch.where` masks.

    Args:
        in_channels (int or tuple): Size of each input sample, or :obj:`-1`
            to derive the size from the first input(s) to the forward
            method. A tuple corresponds to the sizes of source and target
            dimensionalities.
        out_channels (int): Size of each output sample.
        max_degree (int, optional): The maximum node degree to consider when
            updating weights (default: :obj:`10`)
        bias (bool, optional): If set to :obj:`False`, the layer will not
            learn an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.

    Shapes:
        - **inputs:**
          node features :math:`(|\mathcal{V}|, F_{in})` or
          :math:`((|\mathcal{V_s}|, F_{s}), (|\mathcal{V_t}|, F_{t}))`
          if bipartite,
          edge indices :math:`(2, |\mathcal{E}|)`
        - **outputs:** node features :math:`(|\mathcal{V}|, F_{out})` or
          :math:`(|\mathcal{V_t}|, F_{out})` if bipartite
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self,
                x: Union[Tensor, OptPairTensor],
                edge_index: Adj,
                size: Size = None) -> Tensor:
        """Apply the degree-specific linear layers to the aggregated
        neighbor features (and to the target features when available)."""
        if isinstance(x, Tensor):
            x: OptPairTensor = (x, x)
        x_r = x[1]

        if isinstance(edge_index, SparseTensor):
            deg = edge_index.storage.rowcount()
        elif isinstance(edge_index, Tensor):
            target_row = 1 if self.flow == 'source_to_target' else 0
            # Node count: target features first, then `size`, then sources.
            if x_r is not None:
                num_nodes = x_r.size(self.node_dim)
            elif size is not None:
                num_nodes = size[1]
            else:
                num_nodes = x[0].size(self.node_dim)
            deg = degree(edge_index[target_row], num_nodes, dtype=torch.long)
        else:
            deg = x[0]  # Dummy.
        deg.clamp_(max=self.max_degree)

        # propagate_type: (x: OptPairTensor)
        h = self.propagate(edge_index, x=x, size=size)

        out_shape = list(h.size())[:-1] + [self.out_channels]
        out = h.new_empty(out_shape)
        for deg_value, (lin_l, lin_r) in enumerate(
                zip(self.lins_l, self.lins_r)):
            # Rows whose (clamped) degree equals `deg_value`.
            selected = (deg == deg_value).view(-1, 1)

            result = lin_l(torch.where(selected, h, 0.0))
            if x_r is not None:
                result = result + lin_r(torch.where(selected, x_r, 0.0))

            out = torch.where(selected, result, out)

        return out
def radius(
        x: Tensor,
        y: Tensor,
        r: float,
        batch_x: OptTensor = None,
        batch_y: OptTensor = None,
        max_num_neighbors: int = 32,
        *args,
        **kwargs,
) -> Tensor:  # pylint: disable=unused-argument, keyword-arg-before-vararg
    r"""Finds for each element in :obj:`y` all points in :obj:`x` within
    distance :obj:`r`.

    The output has a static shape: every element of :obj:`y` owns
    ``min(x.size(0), max_num_neighbors)`` slots, and slots without a
    neighbor inside the radius hold ``-1`` in both rows.

    .. code-block:: python

        import torch

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch_x = torch.tensor([0, 0, 0, 0])
        y = torch.Tensor([[-1, 0], [1, 0]])
        batch_y = torch.tensor([0, 0])
        assign_index = radius(x, y, 1.5, batch_x, batch_y)

    Args:
        x (torch.Tensor): Node feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`. Non-floating
            point input is converted to float; a 1-D tensor is treated as a
            single feature column.
        y (torch.Tensor): Node feature matrix of the query points
            :math:`\mathbf{Y} \in \mathbb{R}^{M \times F}`, handled like
            :obj:`x`.
        r (float): The radius.
        batch_x (torch.Tensor, optional): Batch vector which assigns each
            node of :obj:`x` to a specific example. All nodes belong to
            example 0 when omitted. (default: :obj:`None`)
        batch_y (torch.Tensor, optional): Batch vector which assigns each
            node of :obj:`y` to a specific example. All nodes belong to
            example 0 when omitted. (default: :obj:`None`)
        max_num_neighbors (int, optional): The maximum number of neighbors
            to return for each element in :obj:`y`; the nearest ones are
            kept. (default: :obj:`32`)
        *args, **kwargs: Accepted and ignored.

    :rtype: :class:`torch.Tensor` with static shape, where not found
        neighbours are marked by -1. Row 0 indexes :obj:`y`, row 1 indexes
        :obj:`x`.
    """
    if batch_x is None:
        batch_x = x.new_zeros(x.size(0), dtype=torch.long)

    if batch_y is None:
        batch_y = y.new_zeros(y.size(0), dtype=torch.long)

    if not torch.is_floating_point(x):
        x = x.float()
    if not torch.is_floating_point(y):
        y = y.float()

    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y

    assert x.dim() == 2 and batch_x.dim() == 1
    assert y.dim() == 2 and batch_y.dim() == 1
    assert x.size(1) == y.size(1)
    assert x.size(0) == batch_x.size(0)
    assert y.size(0) == batch_y.size(0)

    # Append the scaled batch index as an extra coordinate: points of
    # different examples end up at least 2 * r apart and can never match.
    x = torch.cat([x, 2 * r * batch_x.view(-1, 1).to(x.dtype)], dim=-1)
    y = torch.cat([y, 2 * r * batch_y.view(-1, 1).to(y.dtype)], dim=-1)

    distance_upper_bound = r + 1e-8

    dist = torch.cdist(y, x)
    k = min(dist.size(-1), max_num_neighbors)
    dist, col = torch.topk(dist, k=k, dim=-1, largest=False, sorted=True)

    # One query index per slot, built in a single vectorised call (same
    # construction as in `knn`). This also copes with an empty `y`.
    row = torch.arange(col.size(0), dtype=torch.long).view(-1, 1).repeat(
        1, k).view(-1)

    col = torch.where(dist < distance_upper_bound, col, -1)
    col = torch.flatten(col)
    row = torch.where(col == -1, -1, row)

    return torch.stack([row, col], dim=0)
32, flow: str = 'source_to_target', *args, **kwargs, ) -> torch.Tensor: # pylint: disable=unused-argument, keyword-arg-before-vararg r"""Computes graph edges to all points within a given distance. Args: x (Tensor): Node feature matrix :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`. r (float): The radius. batch (LongTensor, optional): Batch vector :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each node to a specific example. :obj:`batch` needs to be sorted. (default: :obj:`None`) loop (bool, optional): If :obj:`True`, the graph will contain self-loops. (default: :obj:`False`) max_num_neighbors (int, optional): The maximum number of neighbors to return for each element. If the number of actual neighbors is greater than :obj:`max_num_neighbors`, returned neighbors are picked randomly. (default: :obj:`32`) flow (string, optional): The flow direction when used in combination with message passing (:obj:`"source_to_target"` or :obj:`"target_to_source"`). (default: :obj:`"source_to_target"`) num_workers (int): Number of workers to use for computation. Has no effect in case :obj:`batch` is not :obj:`None`, or the input lies on the GPU. (default: :obj:`1`) batch_size (int, optional): The number of examples :math:`B`. Automatically calculated if not given. (default: :obj:`None`) :rtype: :class:`LongTensor` .. 
class _TorchGeometricOpsSubstitutionManager:
    """Context manager that temporarily swaps PyG operations for the
    replacements defined in :mod:`poptorch_geometric.ops`.

    On entry every attribute listed in :attr:`subsitutions` is replaced by a
    wrapper around its substitute and the original is remembered in
    ``self.overrides``. On exit the originals are put back.
    """

    # Maps an owner (module or class) to {attribute name: replacement}.
    # NOTE: the attribute name is misspelt, but `registerOptionalOverrides`
    # refers to it by this name, so it has to stay as is.
    subsitutions = {
        torch_geometric.nn: {
            "knn_interpolate": ops.knn_interpolate
        },
        torch_geometric.nn.aggr.base.Aggregation: {
            "assert_sorted_index": ops.Aggregation.assert_sorted_index
        },
        torch_geometric.nn.ClusterGCNConv: {
            "forward": ops.ClusterGCNConv.forward
        },
        torch_geometric.nn.conv.edge_conv: {  # pylint: disable=no-member
            "knn": ops.knn
        },
        torch_geometric.nn.conv.gravnet_conv: {  # pylint: disable=no-member
            "knn": ops.knn
        },
        torch_geometric.nn.conv.x_conv: {  # pylint: disable=no-member
            "knn_graph": ops.knn_graph
        },
        torch_geometric.nn.dense.HeteroLinear: {
            "forward": ops.HeteroLinear.forward
        },
        torch_geometric.nn.InstanceNorm: {
            "forward": ops.InstanceNorm.forward
        },
        torch_geometric.nn.conv.MFConv: {
            "forward": ops.MFConv.forward
        },
        torch_geometric.nn.unpool: {
            "knn_interpolate": ops.knn_interpolate
        },
        torch_geometric.nn.pool: {
            "knn": ops.knn,
            "knn_graph": ops.knn_graph,
            "radius": ops.radius,
            "radius_graph": ops.radius_graph,
        }
    }

    def __init__(self):
        # Originals saved by `replace`: {owner: {attribute name: original}}.
        self.overrides = {}

    def __enter__(self):
        self.replace()
        return self

    def __exit__(self, exc_type, value, traceback):
        self.restore()

    def replace(self):
        """Install all substitutions and switch PyG's
        ``disable_dynamic_shapes`` experimental mode on."""
        torch_geometric.experimental.set_experimental_mode(
            True, 'disable_dynamic_shapes')

        def create_wrapper(f, replacement_f):
            # functools.wraps copies the metadata of the original (name,
            # docstring, ...) onto the wrapper that calls the replacement.
            @functools.wraps(f)
            def _wrapper(*args, **kwargs):
                return replacement_f(*args, **kwargs)

            return _wrapper

        for mod, replacement_map in self.subsitutions.items():
            for op_name, replacement in replacement_map.items():
                func = getattr(mod, op_name)
                self.overrides.setdefault(mod, {})[op_name] = func
                setattr(mod, op_name, create_wrapper(func, replacement))

    def restore(self):
        """Put back every saved original and switch the
        ``disable_dynamic_shapes`` experimental mode off."""
        for mod, replacement_map in self.overrides.items():
            for op_name, func in replacement_map.items():
                setattr(mod, op_name, func)
        torch_geometric.experimental.set_experimental_mode(
            False, 'disable_dynamic_shapes')


@call_once
def registerOptionalOverrides():
    """Register substitutions for ``torch_cluster`` when it is installed.

    Nothing is registered when the package cannot be found or its spec has
    no loader.
    """
    torch_cluster_spec = importlib.util.find_spec("torch_cluster")
    if torch_cluster_spec is None or torch_cluster_spec.loader is None:
        return

    # `Loader.load_module()` is deprecated and executes the module again
    # when it is already imported; `import_module` is the supported way to
    # obtain the module object.
    torch_cluster = importlib.import_module("torch_cluster")

    torch_cluster_overrides = \
        _TorchGeometricOpsSubstitutionManager.subsitutions.setdefault(
            torch_cluster, {})
    torch_cluster_overrides["knn"] = ops.knn
    torch_cluster_overrides["knn_graph"] = ops.knn_graph
    torch_cluster_overrides["radius"] = ops.radius
    torch_cluster_overrides["radius_graph"] = ops.radius_graph
    torch_cluster_overrides["nearest"] = poptorch.nearest


registerOptionalOverrides()
class FixedSizeClusterLoader(torch.utils.data.DataLoader):
    r"""A data loader which merges data objects from a
    :class:`torch_geometric.loader.ClusterData` to a mini-batch of clusters
    and pads node and edge features so tensors across all batches have
    constant shapes.

    Args:
        cluster_data (ClusterData): The cluster from which to load the data.
        fixed_size_options (FixedSizeOptions): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. Its ``num_graphs`` must be :obj:`2`.
        batch_size (int, optional): The number of nodes per mini-batch to
            load. (default: :obj:`1`)
        over_size_strategy (OverSizeStrategy, optional): The behaviour if
            a sample cannot fit in the fixed-size mini-batch. By default,
            if the required number of samples cannot fit into the
            fixed-sized mini-batch, nodes and edges will be removed from
            the mini-batch to achieve the specified fixed size.
            (default: `poptorch_geometric.OverSizeStrategy.TrimNodesAndEdges`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to mini-batch result. They represents three levels of
            padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as padding
            (represented by :obj:`False`). (default: :obj:`True`)
        **kwargs (optional): The additional arguments of
            :class:`torch.utils.data.DataLoader`. ``collate_fn``,
            ``batch_sampler``, ``shuffle`` and ``exclude_keys`` are
            rejected.
    """

    def __init__(
            self,
            cluster_data: ClusterData,
            fixed_size_options: FixedSizeOptions,
            batch_size: int = 1,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.
            TrimNodesAndEdges,
            add_pad_masks: Optional[bool] = True,
            **kwargs,
    ):
        assert fixed_size_options.num_graphs == 2, (
            "The number of graphs in a batch specified by the fixed sized"
            f" options must be 2 when using the {self.__class__.__name__},"
            " currently it is set to"
            f" {fixed_size_options.num_graphs}")

        rejected_names = {
            'collate_fn', 'batch_sampler', 'shuffle', 'exclude_keys'
        }
        unsupported = set(kwargs).intersection(rejected_names)
        assert not unsupported, \
            '`FixedSizeClusterLoader` does not support the following ' \
            f'arguments: {unsupported}.'

        # `_create_collater` reads both attributes, so set them first.
        self.cluster_data = cluster_data
        self.batch_size = batch_size

        node_trimming = (OverSizeStrategy.TrimNodes,
                         OverSizeStrategy.TrimNodesAndEdges)
        edge_trimming = (OverSizeStrategy.TrimEdges,
                         OverSizeStrategy.TrimNodesAndEdges)
        collater = self._create_collater(
            fixed_size_options=fixed_size_options,
            add_masks_to_batch=add_pad_masks,
            trim_nodes=over_size_strategy in node_trimming,
            trim_edges=over_size_strategy in edge_trimming)

        # The loader iterates over cluster indices; the collater turns a
        # list of indices into a padded batch.
        super().__init__(dataset=range(len(cluster_data)),
                         batch_size=batch_size,
                         collate_fn=collater,
                         **kwargs)

    def _collate(self, batch):
        """Merge the selected clusters, then pad the result to the fixed
        size."""
        merged = self.cluster_collater(batch)
        return self.fixed_size_collater([merged])

    def _create_collater(self, **collater_args):
        """Build both collaters and return the combined collate function."""
        loader = ClusterLoader(self.cluster_data, batch_size=self.batch_size)
        self.cluster_collater = loader._collate  # pylint: disable=protected-access
        self.fixed_size_collater = FixedSizeCollater(**collater_args)
        return self._collate
class Collater(PyGCollater):
    """PyG ``Collater`` whose constructor arguments are optional.

    ``None`` (or any other falsy value) for ``follow_batch`` or
    ``exclude_keys`` is passed to the base class as an empty list.
    """

    def __init__(self, follow_batch=None, exclude_keys=None):
        super().__init__(follow_batch or [], exclude_keys or [])
# ==== Copied from PyG and changed to have `_create_collater` method and
#      pass arguments to `__init__`` as keyword ones.
class DataLoader(torch.utils.data.DataLoader):
    r"""A data loader which merges data objects from a
    :class:`torch_geometric.data.Dataset` to a mini-batch.
    Data objects can be either of type :class:`~torch_geometric.data.Data` or
    :class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (List[str], optional): Will exclude each key in the
            list. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`. A ``collate_fn`` entry is
            silently discarded.
    """

    def __init__(
            self,
            dataset: Union[Dataset, Sequence[BaseData]],
            batch_size: int = 1,
            shuffle: bool = False,
            follow_batch: Optional[List[str]] = None,
            exclude_keys: Optional[List[str]] = None,
            **kwargs,
    ):
        # The collate function is always the one built below.
        kwargs.pop('collate_fn', None)

        # Save for PyTorch Lightning < 1.6:
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

        collater = self._create_collater(follow_batch=follow_batch,
                                         exclude_keys=exclude_keys)
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collater,
            **kwargs,
        )

    def _create_collater(self, **collater_args):
        """Return the collate function used by this loader."""
        return Collater(**collater_args)


# ==== End of copied code


class FixedSizeDataLoader(torch.utils.data.DataLoader):
    r"""A data loader which merges data objects from
    :class:`torch_geometric.data.Dataset` to a mini-batch and pads node and
    edge features so tensors across all batches have the same shapes.
    Data objects can be either of type :py:class:`~torch_geometric.data.Data`
    or :py:class:`~torch_geometric.data.HeteroData`.

    Args:
        dataset (Dataset): The :class:`~torch_geometric.data.Dataset` instance
            from which to load the graph samples.
        batch_size (int, optional): The number of graph samples to load in
            each mini-batch. This should be at least :obj:`2` to allow for
            creating at least one padding graph. (default: :obj:`2`)
        shuffle (bool, optional): If :obj:`True`, the data will be reshuffled
            at every epoch. (default: :obj:`False`)
        fixed_size_options (FixedSizeOptions, optional): A
            :py:class:`poptorch_geometric.fixed_size_options.FixedSizeOptions`
            object which holds the maximum number of nodes, edges and other
            options required to pad the mini-batches, produced by the data
            loader, to a fixed size. If not specified, this will be determined
            from the provided dataset. (default: :obj:`None`)
        fixed_size_strategy (FixedSizeStrategy, optional): The
            strategy to use to achieve fixed-size mini-batches. By default,
            each mini-batch will contain a fixed number of real graphs
            (`batch_size` - 1) plus one single graph for padding.
            (default: `poptorch_geometric.FixedSizeStrategy.PadToMax`)
        over_size_strategy (OverSizeStrategy, optional): The behaviour if
            a sample cannot fit in the fixed-size mini-batch. By default,
            if the required number of samples cannot fit into the
            fixed-sized batch an error will be raised.
            (default: `poptorch_geometric.OverSizeStrategy.Error`)
        add_pad_masks (bool, optional): If :obj:`True`, mask objects are
            attached to mini-batch result. They represents three levels of
            padding:

            - :obj:`graphs_mask` - graph level mask
            - :obj:`nodes_mask`  - node level mask
            - :obj:`edges_mask`  - edge level mask

            Mask objects indicate which elements in the mini-batch are real
            (represented by :obj:`True`) and which were added as padding
            (represented by :obj:`False`). (default: :obj:`True`)
        follow_batch (list or tuple, optional): Creates assignment batch
            vectors for each key in the list. (default: :obj:`None`)
        exclude_keys (list or tuple, optional): Keys to exclude from the
            batch. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`. ``collate_fn`` is not
            accepted.

    Raises:
        ValueError: If a ``batch_sampler`` is supplied together with
            ``FixedSizeStrategy.StreamPack``.
        NotImplementedError: If ``fixed_size_strategy`` is neither
            ``StreamPack`` nor ``PadToMax``.
    """

    def __init__(
            self,
            dataset: Dataset,
            batch_size: int = 2,
            shuffle: bool = False,
            fixed_size_options: Optional[FixedSizeOptions] = None,
            fixed_size_strategy: FixedSizeStrategy = FixedSizeStrategy.
            PadToMax,
            over_size_strategy: OverSizeStrategy = OverSizeStrategy.Error,
            add_pad_masks: Optional[bool] = True,
            follow_batch: Optional[Union[List[str], Tuple[str, ...]]] = None,
            exclude_keys: Optional[Union[List[str], Tuple[str, ...]]] = None,
            **kwargs,
    ) -> None:
        if fixed_size_options is None:
            self.fixed_size_options = FixedSizeOptions.from_dataset(
                dataset, batch_size)
        else:
            self.fixed_size_options = fixed_size_options

        # Look at the first sample once to find out whether the dataset is
        # heterogeneous.
        first_sample = dataset[0]
        if (isinstance(first_sample, HeteroData)
                and not self.fixed_size_options.is_hetero()):
            self.fixed_size_options.to_hetero(first_sample.node_types,
                                              first_sample.edge_types)

        assert batch_size == self.fixed_size_options.num_graphs, (
            "`num_graphs` in fixed size options must match"
            " provided batch size in dataloader. `num_graphs`"
            f" is {self.fixed_size_options.num_graphs} but batch"
            f" size is {batch_size}.")

        self.padded_batch_size = batch_size

        batch_sampler = kwargs.pop("batch_sampler", None)

        if fixed_size_strategy == FixedSizeStrategy.StreamPack:
            if batch_sampler is not None:
                raise ValueError(
                    f"Fixed size strategy {fixed_size_strategy} is"
                    " incompatible with the provided batch_sampler"
                    f" {batch_sampler}. Either use a different strategy"
                    " or set `batch_sampler` to `None`.")

            base_sampler = RandomSampler(
                dataset) if shuffle else SequentialSampler(dataset)
            # Leave space for padding.
            # The limits are read from the resolved options: the
            # `fixed_size_options` argument itself may be None.
            sampler_graphs = batch_size - 1
            sampler_nodes = self.fixed_size_options.total_num_nodes - 1
            sampler_edges = self.fixed_size_options.total_num_edges - 1
            batch_sampler = StreamPackingSampler(
                dataset,
                sampler_graphs,
                max_num_nodes=sampler_nodes,
                max_num_edges=sampler_edges,
                base_sampler=base_sampler,
                allow_skip_data=(over_size_strategy == OverSizeStrategy.Skip))
        elif fixed_size_strategy != FixedSizeStrategy.PadToMax:
            raise NotImplementedError(
                f"Fixed size strategy {fixed_size_strategy} is not a supported"
                f" strategy for {self.__class__.__name__}")

        if batch_sampler is not None:
            # The `torch.DataLoader` class expects batch size to be `1`
            # and shuffle to be `None` when `batch_sampler` is provided.
            torch_dataloader_batch_size = 1
            shuffle = None
        else:
            # One slot of the padded batch is reserved for the padding graph.
            torch_dataloader_batch_size = batch_size - 1

        self.batch_sampler = batch_sampler

        assert 'collate_fn' not in kwargs, \
            f'Cannot set `collate_fn` with `{self.__class__.__name__}`. ' \
            'Consider attaching a torch_geometric.transform.Pad transform' \
            ' after your collate_fn and use with' \
            ' `torch.utils.dataloader.DataLoader` to achieve fixed sized' \
            ' batches.'

        collater = self._create_collater(
            fixed_size_options=self.fixed_size_options,
            add_masks_to_batch=add_pad_masks,
            trim_nodes=(
                over_size_strategy in (OverSizeStrategy.TrimNodes,
                                       OverSizeStrategy.TrimNodesAndEdges)),
            trim_edges=(
                over_size_strategy in (OverSizeStrategy.TrimEdges,
                                       OverSizeStrategy.TrimNodesAndEdges)),
            follow_batch=follow_batch,
            exclude_keys=exclude_keys)

        super().__init__(dataset=dataset,
                         batch_size=torch_dataloader_batch_size,
                         shuffle=shuffle,
                         batch_sampler=batch_sampler,
                         collate_fn=collater,
                         **kwargs)

    def _create_collater(self, **collater_args):
        """Return the collate function that pads batches to a fixed size."""
        return FixedSizeCollater(**collater_args)
from functools import lru_cache from typing import Iterable, Iterator, List, Optional, Union from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler from torch_geometric.data import Dataset from torch_geometric.data.data import BaseData __all__ = ['StreamPackingSampler'] class StreamPackingSampler(Sampler[List[int]]): r"""Wraps a sampler to generate a mini-batch of graphs with potentially varying batch sizes. :py:class:`StreamPackingSampler` creates batches by adding one graph at a time to the batch one at a time without exceeding the maximum number of nodes, edges, or graphs. This gives similar results to packing without requiring the dataset to be preprocessed. Args: data_source (torch_geometric.data.Dataset): The data source to process. max_num_graphs (int): The maximum number of graphs to include in a batch. max_num_nodes (int, optional): The maximum number of nodes allowed in a batch. (default: :obj:`None`) max_num_edges (int, optional): The maximum number of edges allowed in a batch. (default: :obj:`None`) base_sampler (Sampler or Iterable, optional): The base sampler used to sample the graphs before packing them into a batch. This can be any iterable object. (default: SequentialSampler(data_source)) allow_skip_data (bool, optional): If true, allows for a skip :obj:`data_source` item to be skipped. Otherwise, a :py:exc:`RuntimeError` will be thrown when the sampler is not able to form a single item batch from :obj:`data_source`, because the iterated data exceeds the maximum batch requirements. 
(default :obj:`False`) """ def __init__( self, data_source: Dataset, max_num_graphs: int, max_num_nodes: Optional[int] = None, max_num_edges: Optional[int] = None, base_sampler: Optional[Union[Sampler[int], Iterable[int]]] = None, allow_skip_data: Optional[bool] = False) -> None: super().__init__(data_source) self._validate(base_sampler, max_num_nodes, max_num_edges, max_num_graphs) self.data_source = data_source self.max_num_graphs = max_num_graphs self.max_num_nodes = max_num_nodes if max_num_nodes is None: self.max_num_nodes = max(data.num_nodes for data in data_source) * max_num_graphs self.max_num_edges = max_num_edges if max_num_edges is None: self.max_num_edges = max(data.num_edges for data in data_source) * max_num_graphs self.base_sampler = base_sampler if base_sampler is not None else \ SequentialSampler(data_source) self.allow_skip_data = allow_skip_data def _validate(self, sampler, max_num_nodes, max_num_edges, max_num_graphs): if sampler is not None and len(sampler) == 0: raise ValueError( f'The `sampler` {sampler} provided is invalid,' ' the length of the sampler must be greater than 0.') def validate_batch_limit(param, param_name, limit=1): if param is not None and param < limit: raise ValueError( f'Invalid value for `{param_name}` parameter, ' f'{param_name} should be at least greater ' f' than {limit}.') if max_num_graphs is None: raise ValueError('Invalid value for `max_num_graphs` parameter.' 
' `max_num_graphs` must be an integer of at least' ' 1, it is None.') validate_batch_limit(max_num_graphs, 'max_num_graphs', 1) validate_batch_limit(max_num_nodes, 'max_num_nodes', max_num_graphs) validate_batch_limit(max_num_edges, 'max_num_edges', max_num_graphs) class _Batch: def __init__(self) -> None: self.indices: List[int] = [] self.num_nodes = 0 self.num_edges = 0 self.num_graphs = 0 def append(self, idx: int, data: BaseData) -> None: self.indices.append(idx) self.num_nodes += data.num_nodes self.num_edges += data.num_edges self.num_graphs += 1 def empty(self) -> bool: return len(self.indices) == 0 def __repr__(self) -> str: return f'Batch{{ indices: {self.indices}, ' \ f'num_nodes: {self.num_nodes}, ' \ f'num_edges: {self.num_edges}, ' \ f'num_graphs: {self.num_graphs} }}' def __iter__(self) -> Iterator[List[int]]: batch = self._Batch() for idx in self.base_sampler: data = self.data_source[idx] is_data_appendable = True while True: if self._has_space(batch, data): batch.append(idx, data) elif not batch.empty(): yield batch.indices batch = self._Batch() continue else: is_data_appendable = False if not self.allow_skip_data and not is_data_appendable: raise RuntimeError( 'The maximum number of graphs, nodes or edges' ' specified is too small to fit in the single sample' f' {idx} with {data.num_nodes} nodes and' f' {data.num_edges} edges. The maximum number of graphs' f' specified is {self.max_num_graphs}, the maximum' f' number of nodes is {self.max_num_nodes} and the' f' maximum number of edges is {self.max_num_edges}.' ' If this is intended, use `allow_skip_data` to' ' enable this sample to be completely skipped' f' from batching. 
The sample is {data}.') break if not batch.empty(): yield batch.indices def _has_space(self, batch: _Batch, data: BaseData) -> bool: next_nodes = data.num_nodes next_edges = data.num_edges nodes_left = self.max_num_nodes - (batch.num_nodes + next_nodes) edges_left = self.max_num_edges - (batch.num_edges + next_edges) graphs_left = self.max_num_graphs - (batch.num_graphs + 1) graph_fits = nodes_left >= 0 and edges_left >= 0 and \ graphs_left >= 0 has_space_for_padding = nodes_left >= graphs_left and \ edges_left >= graphs_left has_space = graph_fits and has_space_for_padding return has_space @lru_cache(maxsize=128) def __len__(self) -> int: if isinstance(self.base_sampler, RandomSampler): raise NotImplementedError( f'{self.__class__.__name__} length (`__len__`) cannot' ' be determined. The base sampler used is an instance of' '`RandomSampler`, which will result in' f' {self.__class__.__name__} producing a nondeterministic' ' number of batches. When using this sampler with stream' ' packing avoid requiring the length.') return len(list(self.__iter__())) ================================================ FILE: poptorch_geometric/python/types.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
from itertools import chain try: from functools import singledispatchmethod except ImportError: from singledispatchmethod import singledispatchmethod from typing import Any, Generator, Union, Iterable, List import torch from torch_geometric.data import Batch, Data, HeteroData from torch_geometric.data.storage import BaseStorage from torch_geometric.data.data import BaseData from poptorch import ICustomArgParser, registerCustomArgParser from poptorch_geometric.common import DataBatch, HeteroDataBatch, call_once class PyGArgsParser(ICustomArgParser): @staticmethod def _sortedTensorKeys(struct: Union[Data, DataBatch]) -> Iterable[str]: all_keys = sorted(struct.keys) def isTensor(k): return isinstance(struct[k], torch.Tensor) return filter(isTensor, all_keys) @singledispatchmethod def yieldTensors(self, struct) -> Generator[torch.Tensor, None, None]: raise ValueError(f'Unsupported data type: {type(struct)}') @yieldTensors.register def _(self, struct: Data or DataBatch) -> Generator[torch.Tensor, None, None]: for k in self._sortedTensorKeys(struct): yield struct[k] @yieldTensors.register def _(self, struct: HeteroData or HeteroDataBatch) -> Generator[torch.Tensor, None, None]: def isTensor(val): return isinstance(val, torch.Tensor) for v in filter(isTensor, struct._global_store.values()): # pylint: disable=protected-access yield v for attr in chain(struct.node_stores, struct.edge_stores): if isinstance(attr, BaseStorage): for v in filter(isTensor, attr.values()): yield v @staticmethod def _setup_num_fields( batch: Union[DataBatch, HeteroDataBatch], original_structure: Union[DataBatch, HeteroDataBatch]): if hasattr(original_structure, '_num_graphs'): batch._num_graphs = original_structure._num_graphs # pylint: disable=protected-access num_nodes = original_structure.num_nodes num_edges = original_structure.num_edges batch['num_nodes'] = num_nodes batch['num_edges'] = num_edges if isinstance(batch, HeteroDataBatch): # We need to override properties getters, to make them 
return the # proper (device iterations independent) `num_nodes` and `num_edges` # The general idea is to return values from `num_nodes` or # `num_edges` fields (if defined) in the first place. def nodes_fget(sub_self): if 'num_nodes' in sub_self._global_store: # pylint: disable=protected-access return sub_self['num_nodes'] return super(type(sub_self), sub_self).num_nodes setattr(HeteroDataBatch, 'num_nodes', property(fget=nodes_fget)) def edges_fget(sub_self): if 'num_edges' in sub_self._global_store: # pylint: disable=protected-access return sub_self['num_edges'] return super(type(sub_self), sub_self).num_edges setattr(HeteroDataBatch, 'num_edges', property(fget=edges_fget)) @staticmethod def _add_next(tensor_iterator: Iterable[List[Any]], original_struct_val: Any) -> Any: if isinstance(original_struct_val, torch.Tensor): return next(tensor_iterator) return original_struct_val @singledispatchmethod def reconstruct(self, original_structure, tensor_iterator: Iterable[torch.Tensor]) -> Any: # pylint: disable=unused-argument raise ValueError(f'Unsupported data type: {type(original_structure)}') @reconstruct.register def _(self, original_structure: Data or DataBatch, tensor_iterator: Iterable[torch.Tensor]) -> Union[Data, DataBatch]: """ Create a new instance with the same class type as the original_structure. This new instance will be initialized with tensors from the provided iterator and uses the same sorted keys from the yieldTensors() implementation. 
""" tensor_keys = self._sortedTensorKeys(original_structure) kwargs = dict() for key in tensor_keys: kwargs[key] = self._add_next(tensor_iterator, original_structure[key]) cls = original_structure.__class__ if cls is DataBatch: batch = Batch(**kwargs, _base_cls=Data) self._setup_num_fields(batch, original_structure) return batch return Data(**kwargs) @reconstruct.register def _(self, original_structure: HeteroData or HeteroDataBatch, tensor_iterator: Iterable[torch.Tensor] ) -> Union[HeteroData, HeteroDataBatch]: """ Create a new instance with the same class type as the original_structure. This new instance will be initialized with tensors from the provided iterator and uses the same sorted keys from the yieldTensors() implementation. """ kwargs = dict() for key, attr in original_structure._global_store.items(): # pylint: disable=protected-access kwargs[key] = self._add_next(tensor_iterator, attr) for key, attr in chain(original_structure.node_items(), original_structure.edge_items()): if isinstance(attr, BaseStorage): kwargs[key] = { k: self._add_next(tensor_iterator, v) for k, v in attr.items() } else: kwargs[key] = self._add_next(attr, attr) cls = original_structure.__class__ if cls is HeteroDataBatch: batch = Batch(kwargs, _base_cls=HeteroData) self._setup_num_fields(batch, original_structure) return batch return HeteroData(kwargs) # PyG uses the BaseData object as the root for data and batch objects. @call_once def registerCustomArgParsers(): registerCustomArgParser(BaseData, PyGArgsParser()) registerCustomArgParser(DataBatch, PyGArgsParser()) registerCustomArgParser(HeteroDataBatch, PyGArgsParser()) registerCustomArgParsers() ================================================ FILE: poptorch_geometric/python/utils.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import time from copy import deepcopy import torch from torch.testing import assert_close from torch_geometric.nn import MessagePassing import poptorch def set_aggregation_dim_size(model: torch.nn.Module, dim_size: int): """Sets the dim_size argument used in the aggregate step of message passing The dim_size will need to be at least as large as the total number of nodes in the batch. """ def set_dim_size_hook(module, inputs): # pylint: disable=unused-argument aggr_kwargs = inputs[-1] aggr_kwargs['dim_size'] = dim_size return aggr_kwargs for module in model.modules(): if isinstance(module, MessagePassing): module.register_aggregate_forward_pre_hook(set_dim_size_hook) class TrainingStepper: """ Test utility for comparing training runs between IPU and CPU. Usage: model = ... batch = ... model.train() stepper = TrainingSteper(model) stepper.run(10, batch) """ def __init__(self, model, lr=0.001, optimizer=poptorch.optim.Adam, options=None, rtol=None, atol=None, enable_fp_exception=True, equal_nan=False): super().__init__() model.train() self.lr = lr self.rtol = rtol self.atol = atol self.equal_nan = equal_nan self.enable_fp_exception = enable_fp_exception self.options = poptorch.Options() if options is None else options self.training_model = None self.inference_model = None self.setup_cpu(model, optimizer) self.setup_ipu(model, optimizer) self.check_parameters() def setup_cpu(self, model, optimizer): self.cpu_model = deepcopy(model) parameters = list(self.cpu_model.parameters()) if parameters: self.optimizer = optimizer(parameters, lr=self.lr) def setup_ipu(self, model, optimizer): self.ipu_model = deepcopy(model) options = self.options if self.enable_fp_exception: options.Precision.enableFloatingPointExceptions(True) parameters = list(self.ipu_model.parameters()) if parameters: ipu_optimizer = optimizer(parameters, lr=self.lr) self.training_model = poptorch.trainingModel( self.ipu_model, optimizer=ipu_optimizer, options=options) self.inference_model = 
poptorch.inferenceModel(self.ipu_model, options=options) def check_parameters(self): for cpu, ipu in zip(self.cpu_model.named_parameters(), self.ipu_model.named_parameters()): name, cpu = cpu ipu = ipu[1] self.assert_close(actual=ipu, expected=cpu, id=name) def cpu_step(self, batch): self.optimizer.zero_grad() out, loss = self.cpu_model(*batch) loss.backward() self.optimizer.step() return out, loss def ipu_step(self, batch, copy_weights=True): out, loss = self.training_model(*batch) if copy_weights: self.training_model.copyWeightsToHost() return out, loss def run(self, *args): assert self.training_model, 'Training model was not created.' self.cpu_model.train() if len(args) == 2: self._run_common_input(*args) elif len(args) == 3: self._run_separate_inputs(*args) assert True, f"Wrong number of args ({len(args)}!)" def run_inference(self, batch): self.cpu_model.eval() with torch.no_grad(): cpu_out = self.cpu_model(*batch) ipu_out, _ = self.inference_model(*batch) self.assert_close(actual=ipu_out, expected=cpu_out, id="inference") def _run_common_input(self, num_steps, batch): cpu_loss = torch.empty(num_steps) ipu_loss = torch.empty(num_steps) for i in range(num_steps): cpu_out, cpu_loss[i] = self.cpu_step(batch) ipu_out, ipu_loss[i] = self.ipu_step(batch) self.assert_close(actual=ipu_out, expected=cpu_out, id="Output") self.check_parameters() self.assert_close(actual=ipu_loss, expected=cpu_loss, id="loss") def _run_separate_inputs(self, num_steps, cpu_batch, ipu_batch): cpu_loss = torch.empty(num_steps) ipu_loss = torch.empty(num_steps) for i in range(num_steps): cpu_out, cpu_loss[i] = self.cpu_step(cpu_batch) ipu_out, ipu_loss[i] = self.ipu_step(ipu_batch) min_shape = min(cpu_out.shape[0], ipu_out.shape[0]) self.assert_close(actual=ipu_out[:min_shape], expected=cpu_out[:min_shape], id="Output") self.check_parameters() self.assert_close(actual=ipu_loss, expected=cpu_loss, id="loss") def assert_close(self, actual, expected, id): def msg_fn(msg): return f"{id} was not 
equal:\n\n{msg}\n" assert_close(actual=actual, expected=expected, msg=msg_fn, rtol=self.rtol, atol=self.atol, equal_nan=self.equal_nan) def benchmark(self, num_steps, batch, devices=('ipu')): results = {} if 'ipu' in devices: _, _ = self.ipu_step(batch, copy_weights=False) t_start = time.perf_counter() for _ in range(num_steps): _, _ = self.ipu_step(batch, copy_weights=False) t_end = time.perf_counter() results['ipu_time'] = t_end - t_start if 'cpu' in devices: _, _ = self.cpu_step(batch) t_start_cpu = time.perf_counter() for _ in range(num_steps): _, _ = self.cpu_step(batch) t_end_cpu = time.perf_counter() results['cpu_time'] = t_end_cpu - t_start_cpu if 'gpu' in devices: results['gpu_time'] = None raise NotImplementedError('GPU benchmarking currently unsupported') return results ================================================ FILE: poptorch_geometric/requirements.txt ================================================ # Install pre-built wheels for PyTorch Geometric that are compatible with # poptorch which is currently pinned to torch 2.0.1 --find-links https://data.pyg.org/whl/torch-2.0.1+cpu.html pyg-nightly==2.4.0.dev20230613 torch-scatter==2.1.1+pt20cpu torch-sparse==0.6.17+pt20cpu torch-cluster==1.6.1+pt20cpu torch-spline-conv==1.2.2+pt20cpu pytest-benchmark==4.0.0 pytest-cov==4.0.0 nbconvert==7.2.9 nbformat==5.7.3 pandas==2.0.1 singledispatchmethod==1.0; python_version < '3.8' ================================================ FILE: poptorch_geometric/setup.cfg ================================================ [metadata] license_files = License.txt poptorch_geometric_third_party_licenses.txt ================================================ FILE: poptorch_geometric/setup.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import sys
from setuptools import setup, find_packages

# Runtime dependencies. The `@...@` entries are placeholders; presumably
# they are substituted by the build system before this script runs -
# TODO confirm.
REQUIRES = [
    '@PYG_DEPENDENCY@',
    '@POPTORCH_DEPENDENCY@',
    '@TORCH_SCATTER_DEPENDENCY@',
    '@TORCH_SPARSE_DEPENDENCY@',
]

# "major.minor" of the interpreter running this script; the package is
# pinned to exactly this version through `python_requires` below.
python_version = f'{sys.version_info.major}.{sys.version_info.minor}'
# Python 3.7 needs the `singledispatchmethod` backport package.
if python_version == '3.7':
    REQUIRES.append('singledispatchmethod==1.0')

VERSION = '@VERSION@'

# Used both as the short and the long description of the package.
LONG_DESCRIPTION = (
    'PopTorch Geometric is a set of extensions for PyTorch Geometric, enabling '
    'GNN models to be trained, evaluated and used on the Graphcore IPU.')

setup(name='poptorch_geometric',
      version=VERSION,
      description=LONG_DESCRIPTION,
      long_description=LONG_DESCRIPTION,
      long_description_content_type='text/markdown',
      license='MIT License',
      license_files=('License.txt',
                     'poptorch_geometric_third_party_licenses.txt'),
      author='Graphcore Ltd.',
      author_email='contact@graphcore.ai',
      url='http://graphcore.ai',
      classifiers=[
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'Topic :: Scientific/Engineering',
          'Topic :: Scientific/Engineering :: Artificial Intelligence',
          'License :: OSI Approved :: MIT License',
          'Programming Language :: Python :: 3',
      ],
      platforms='@PLATFORM@',
      install_requires=REQUIRES,
      python_requires=f'=={python_version}.*',
      packages=find_packages())

================================================
FILE: poptorch_logging/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(poptorch_logging)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

find_package(spdlog 1.8.0 EXACT REQUIRED)

# Packages provided by Poplar
find_package(libpvti REQUIRED)
find_package(gccs REQUIRED)

add_library(poptorch_logging STATIC
  "source/Error.cpp"
  "source/Logging.cpp"
  "source/Tracepoint.cpp")

file(GLOB_RECURSE poptorch_logging_public_headers "${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp*")

set_target_properties(poptorch_logging PROPERTIES
  CXX_STANDARD 14
  PUBLIC_HEADER "${poptorch_logging_public_headers}")
target_include_directories(poptorch_logging SYSTEM PUBLIC $ $) # Unfortunately, there seems to be an issue with using the `spdlog::*` targets # directly with `target_link_libraries()`, which breaks dependencies of # `poptorch_logging` adding any other include directories. Instead, we'll # manually add spdlog's include directories and compile definitions here. target_include_directories(poptorch_logging SYSTEM PUBLIC $) target_compile_definitions(poptorch_logging PUBLIC $) target_link_libraries(poptorch_logging PRIVATE libpvti gccs_stacktrace) install(TARGETS poptorch_logging LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/poptorch_logging) ================================================ FILE: poptorch_logging/include/poptorch_logging/Error.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_LOGGING_ERROR_HPP #define INCLUDE_POPTORCH_LOGGING_ERROR_HPP #include #include #include #include #include #include namespace poptorch { namespace logging { namespace detail { struct LogContextImpl; struct ErrorImpl; } // namespace detail // Remove everything before the last occurrence of "/poptorch/" in a string // For example given an absolute path like: // /a/b/c/poptorch/d/e/f.cpp -> poptorch/d/e/f.cpp const char *shortPoptorchFilename(const char *filename); #define UNLIKELY(var) __builtin_expect(var, 0) #define UNUSED(var) (void)(var) #define ERROR(msg) \ do { \ std::stringstream __error_msg; \ __error_msg << msg; /* NOLINT */ \ throw ::poptorch::logging::InternalError(__error_msg.str().c_str(), \ __FILE__, __LINE__); \ } while (0) #define ERROR_ON_MSG(condition, msg) \ do { \ if (UNLIKELY(static_cast(condition))) { \ ERROR(msg); \ } \ } while (0) #define ERROR_ON(condition) ERROR_ON_MSG(condition, #condition) /** * Exception class for poptorch. * * The what() method returns both the error message and the * stacktrace. 
* To have the error without the stacktrace use message(). */ class Error : public std::runtime_error { public: explicit Error(const char *s, const char *file, uint64_t line); Error(Error &&e); const char *file() const; uint64_t line() const; // The error message without the stacktrace const char *message() const; ~Error() override; private: std::unique_ptr _impl; }; /** * Exception class specific to internal errors * This should be used as an assert; for states where the user should not have * been able to create. */ class InternalError : public Error { public: using Error::Error; }; /* Context stack used to attach extra information to exceptions when they're * raised. All contexts changes can be printed by enabling the info mode. */ class LogContext { public: // Current context stack as a string static std::unique_ptr context(); static void resetContext(); static void push(const char *); LogContext(); // Push the context at the top of the context stack. explicit LogContext(const std::string &context) : LogContext(context.c_str()) {} explicit LogContext(const char *context); // Replace the top of the context stack with new_context. void updateContext(const std::string &new_context); // Pop the top of the context stack. void clear(); // Implicitly pop the top of the context stack if clear() hasn't been // explicitly called. ~LogContext(); private: std::unique_ptr _impl; }; } // namespace logging } // namespace poptorch #endif // INCLUDE_POPTORCH_LOGGING_ERROR_HPP ================================================ FILE: poptorch_logging/include/poptorch_logging/Logging.hpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_LOGGING_H #define INCLUDE_POPTORCH_LOGGING_H #include #include #include #include #include "poptorch_logging/LoggingLight.hpp" /// This is a simple logging system for poptorch based on spdlog. 
The easiest /// way to use it is to simply call `logging::()` where is one /// of trace, debug, info, warn or err. For example: /// /// #include /// /// void foo(int i) { /// logging::info("foo({}) called", i); /// } /// /// logging can be configured by the methods below, or by environment /// variables, eg /// POPTORCH_LOG_LEVEL=ERR /// POPTORCH_LOG_DEST=Mylog.txt /// /// Formatting is done using the `fmt` library. It supports {}-style and %-style /// format specification strings. See https://github.com/fmtlib/fmt for details. namespace poptorch { namespace logging { // Log a formatted message. This uses the `fmt` C++ library for formatting. // See https://github.com/fmtlib/fmt for details. You should probably use // the MAKE_LOG_TEMPLATE macros instead, e.g. // logging::debug("The answer is: {}", 42). template void log(Level l, const char *s, const Args &...args) { // Avoid formatting if the logging is disabled anyway. if (shouldLog(l)) { const std::string str = fmt::format(s, args...); log(l, str.c_str()); } } // Create a bit of syntactic sugar which allows log statements // of the form logging::debug("Msg"). 
#define MAKE_LOG_TEMPLATE(fnName, lvl) \ template \ inline void fnName(const char *s, const Args &...args) { \ log(Level::lvl, s, std::forward(args)...); \ } \ \ template \ inline void fnName(std::uint64_t &dedup_count, const char *s, \ const Args &...args) { \ std::uint64_t rlimit = repeatLimit(); \ if (dedup_count > rlimit) { \ return; \ } \ if (dedup_count < rlimit) { \ log(Level::lvl, s, std::forward(args)...); \ } else { \ log(Level::lvl, "...repeated messages suppressed..."); \ } \ dedup_count++; \ } MAKE_LOG_TEMPLATE(trace, Trace) MAKE_LOG_TEMPLATE(debug, Debug) MAKE_LOG_TEMPLATE(info, Info) MAKE_LOG_TEMPLATE(warn, Warn) MAKE_LOG_TEMPLATE(err, Err) #undef MAKE_LOG_TEMPLATE // Convenience macro to create a log entry prefixed with function name e.g.: // void someFunc(int i) { // FUNC_LOGGER(info, " with i := {}", i); // } // Then the log entry would be something like: // 14:30:31.00 [I] void someFunc(int): with i := 42 // NOTE: Because of the limitations of __VA_ARGS__ this log entry must have at // least one parameter. #define FUNC_LOGGER(lvl, fmtStr, ...) \ logging::lvl("{}: " fmtStr, __PRETTY_FUNCTION__, __VA_ARGS__) #undef FUNC_LOGGER } // namespace logging } // namespace poptorch #endif // INCLUDE_POPTORCH_LOGGING_H ================================================ FILE: poptorch_logging/include/poptorch_logging/LoggingLight.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #ifndef INCLUDE_POPTORCH_LOGGING_LIGHT_H #define INCLUDE_POPTORCH_LOGGING_LIGHT_H #include #include // This header is a lighter version of poptorch_logging which doesn't require // spdlog and therefore doesn't support formatting. // // For the full version of the logging API use // poptorch_logging/Logging.hpp instead. namespace poptorch { namespace logging { enum class Level { Trace = 0, Debug = 1, Info = 2, Warn = 3, Err = 4, // level 5 is "critical" in spdlog, which we don't use so isn't exposed here. 
Off = 6, }; // Set the current log level to one of the above levels. The default // log level is set by the POPTORCH_LOG_LEVEL environment variable // and is off by default. void setLogLevel(Level l); // Return true if the passed log level is currently enabled. bool shouldLog(Level l); // Return true if the Popart IR should be dumped. bool outputPopartIR(); // Return number of times logs should be allowed to repeat std::uint64_t repeatLimit(); void setRepeatLimit(std::uint64_t limit); // Flush the log. By default it is only flushed when the underlying libc // decides to. void flush(); // Log a message. You should probably use the MAKE_LOG_TEMPLATE macros // instead, e.g. logging::debug("A debug message"). void log(Level l, const char *msg); } // namespace logging } // namespace poptorch #endif // INCLUDE_POPTORCH_LOGGING_LIGHT_H ================================================ FILE: poptorch_logging/include/poptorch_logging/Tracepoint.hpp ================================================ // Copyright (c) 2022 Graphcore Ltd. All rights reserved. #ifndef SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP #define SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP #include #include #include namespace poptorch { namespace logging { namespace detail { class TracepointImpl; } /** RAII class to create tracepoints */ class Tracepoint { public: explicit Tracepoint(const char *label); ~Tracepoint(); static void begin(const char *label); static void end(const char *label); private: std::unique_ptr _impl; }; inline std::string formatPrettyFunction(const char *c) { std::string s(c); // Find the namespace(s)::class::method substring // First locate the start of the arguments auto j = std::find(s.begin(), s.end(), '('); // Second find the last space before the arguments // PRETTY_FUNCTION can return "virtual void poptorch::...." 
auto i = std::find(std::make_reverse_iterator(j), s.rend(), ' '); // Get the position of the beginning of the substring auto begin_pos = s.size() - static_cast(i - s.rbegin()); // Get the size of the substring auto size = static_cast(j - s.begin()) - begin_pos; return s.substr(begin_pos, size); } #define POPTORCH_TRACEPOINT() \ poptorch::logging::Tracepoint tp { \ poptorch::logging::formatPrettyFunction(__PRETTY_FUNCTION__).c_str() \ } #define POPTORCH_TRACEPOINT_WITH_DEBUG_INFO(debug_info) \ std::stringstream ss; \ ss << poptorch::logging::formatPrettyFunction(__PRETTY_FUNCTION__) << " (" \ << (debug_info) << ")"; \ poptorch::logging::Tracepoint tp { ss.str().c_str() } } // namespace logging } // namespace poptorch #endif // SOURCE_INCLUDE_POPTORCH_TRACEPOINT_HPP ================================================ FILE: poptorch_logging/source/Error.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. #include "poptorch_logging/Error.hpp" #include // This is a wrapper for boost::stacktrace without exposing Boost. 
// Shorten a source path so that it starts at its last "poptorch/" directory.
//
// Returns a pointer into the buffer passed as `filename` (nothing is
// allocated): either the character following the '/' of the last
// "/poptorch/" occurrence, or `filename` itself when the marker is absent.
const char *shortPoptorchFilename(const char *filename) {
  const std::string full_path(filename);
  const std::string::size_type marker = full_path.rfind("/poptorch/");
  const bool has_marker = marker != std::string::npos;
  // + 1 skips the leading '/' of the marker.
  return has_marker ? filename + marker + 1 : filename; // NOLINT
}
LogContext::updateContext(const std::string &new_context) { clear(); getContext().push_back(new_context); _impl->cleared = false; if (detail::LogContextImpl::trace_enabled) { logging::trace("[{}] Start", singleLineContext()); } } void LogContext::clear() { if (!_impl->cleared) { // Don't restore the saved context if we're handling an exception // we might want to recover the context later. if (std::uncaught_exceptions() == 0) { if (detail::LogContextImpl::trace_enabled && !getContext().empty()) { logging::trace("[{}] End", singleLineContext()); } // Don't restore the saved context if the context has been cleared. if (!getContext().empty()) { getContext().pop_back(); } } _impl->cleared = true; } } LogContext::~LogContext() { clear(); } /* static */ std::unique_ptr LogContext::context() { std::stringstream ss; auto &ctx = getContext(); if (ctx.empty()) { return nullptr; } for (int64_t idx = ctx.size() - 1; idx >= 0; --idx) { ss << " [" << ctx.size() - idx - 1 << "] " << ctx.at(idx) << std::endl; } std::string str = ss.str(); auto ptr = std::unique_ptr(new char[str.size() + 1]); str.copy(ptr.get(), std::string::npos); ptr.get()[str.size()] = '\0'; return ptr; } /* static */ void LogContext::resetContext() { return getContext().clear(); } /* static */ void LogContext::push(const char *new_context) { getContext().push_back(new_context); } } // namespace logging } // namespace poptorch ================================================ FILE: poptorch_logging/source/Logging.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
#include "poptorch_logging/Logging.hpp" #include #include #include #include #include #include #include #include namespace poptorch { namespace logging { namespace { // Check our enums match (incase spdlog changes under us) static_assert(static_cast(Level::Trace) == spdlog::level::trace, "Logging enum mismatch"); static_assert(static_cast(Level::Off) == spdlog::level::off, "Logging enum mismatch"); // Translate to a speedlog log level. spdlog::level::level_enum translate(Level l) { return static_cast(l); } // Stores the logging object needed by spdlog. struct LoggingContext { LoggingContext(); std::shared_ptr logger; bool output_popart_ir{false}; std::uint64_t repeat_limit{4u}; }; LoggingContext &context() { // This avoids the static initialisation order fiasco, but doesn't solve the // deinitialisation order. Who logs in destructors anyway? static thread_local LoggingContext logging_context; return logging_context; } Level logLevelFromString(const std::string &level) { if (level == "TRACE" || level == "TRACE_ALL") { return Level::Trace; } if (level == "DEBUG" || level == "DEBUG_IR") { return Level::Debug; } if (level == "INFO") { return Level::Info; } if (level == "WARN") { return Level::Warn; } if (level == "ERR") { return Level::Err; } if (level == "OFF" || level.empty()) { return Level::Off; } throw std::runtime_error(fmt::format( "Unknown POPTORCH_LOG_LEVEL '{}'. Valid values are TRACE_ALL, TRACE, " "DEBUG, DEBUG_IR, INFO, WARN, ERR and OFF.", level)); } template void setColours(spdlog::sinks::ansicolor_sink &sink) { // See https://en.wikipedia.org/wiki/ANSI_escape_code#Colors // Ansi colours make zero sense. 
static const std::string bright_black = "\033[90m"; sink.set_color(spdlog::level::trace, bright_black); sink.set_color(spdlog::level::debug, sink.cyan); sink.set_color(spdlog::level::info, sink.white); sink.set_color(spdlog::level::warn, sink.yellow_bold); sink.set_color(spdlog::level::err, sink.red_bold); } LoggingContext::LoggingContext() { auto *poptorch_log_dest = std::getenv("POPTORCH_LOG_DEST"); auto *poptorch_log_level = std::getenv("POPTORCH_LOG_LEVEL"); // Get logging output from the POPTORCH_LOG_DEST environment variable. // The valid options are "stdout", "stderr", or if it is neither // of those it is treated as a filename. The default is stderr. const std::string log_dest = poptorch_log_dest != nullptr ? poptorch_log_dest : "stderr"; const std::string log_level = poptorch_log_level != nullptr ? poptorch_log_level : "WARN"; // Get logging level from OS ENV. The default level is off. Level default_level = logLevelFromString(log_level); if (log_dest == "stdout") { auto sink = std::shared_ptr(); setColours(*sink); logger = std::make_shared("graphcore", sink); } else if (log_dest == "stderr") { auto sink = std::make_shared(); setColours(*sink); logger = std::make_shared("graphcore", sink); } else { try { logger = spdlog::basic_logger_mt("graphcore", log_dest, true); } catch (const spdlog::spdlog_ex &e) { std::cerr << "Error opening log file: " << e.what() << std::endl; throw; } } logger->set_pattern("%^[%T.%e] [poptorch:cpp] [%l] %v%$"); logger->set_level(translate(default_level)); output_popart_ir = log_level == "DEBUG_IR"; } } // namespace bool outputPopartIR() { return context().output_popart_ir || shouldLog(Level::Trace); } std::uint64_t repeatLimit() { return context().repeat_limit; } void setRepeatLimit(std::uint64_t limit) { context().repeat_limit = limit; } void log(Level l, const char *msg) { context().logger->log(translate(l), msg); } bool shouldLog(Level l) { return context().logger->should_log(translate(l)); } void setLogLevel(Level l) { 
// Implementation behind Tracepoint: a pvti tracepoint paired with a
// LogContext built from the same label.
class TracepointImpl : public pvti::Tracepoint {
public:
  // Create the pvti tracepoint on the shared channel, then build the log
  // context from the same label (the base class is initialised before the
  // `ctx` member).
  explicit TracepointImpl(const std::string &label_)
      : pvti::Tracepoint(&TracepointImpl::channel, label_), ctx(label_) {}
  ~TracepointImpl() = default;

  // Trace channel shared by all the tracepoints; defined outside the class.
  static pvti::TraceChannel channel;
  // Log context created from the tracepoint's label; it is destroyed
  // together with this object.
  LogContext ctx;
};
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Please also refer to the file https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md, which clarifies licensing of external contributions to this project including patches, pull requests, etc. spdlog -------- The MIT License (MIT) Copyright (c) 2016 Gabi Melman. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -- NOTE: Third party dependency used by this software -- This software depends on the fmt lib (MIT License), and users must comply to its license: https://github.com/fmtlib/fmt/blob/master/LICENSE.rst ================================================ FILE: pyproject.toml ================================================ [build-system] requires = [ "python_version>=3.7", "setuptools>=42", "wheel", "pybind11>=2.8.0", "@TORCH_DEPENDENCY@", ] build-backend = "setuptools.build_meta" [tool.pytest.ini_options] # Required to suppress a warning from the package `ruamel` using a deprecated pkg_resources function. filterwarnings = [ "ignore::DeprecationWarning:pkg_resources.*", # Deprecation warnings from pillow in torchvision. "ignore:.*Pillow.*:DeprecationWarning:torchvision", ] ================================================ FILE: python/CMakeLists.txt ================================================ include(GNUInstallDirs) file(GLOB python_files "${CMAKE_CURRENT_SOURCE_DIR}/*.py") # __init__.py needs to be edited by set_version.py so don't copy it over.
list(REMOVE_ITEM python_files "${CMAKE_CURRENT_SOURCE_DIR}/__init__.py") run_poptorch_install_command("python3 ${PROJECT_SOURCE_DIR}/scripts/set_version.py --torch-version ${TORCH_VERSION} ${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "${PROJECT_SOURCE_DIR}" "Generate __init__.py") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py DESTINATION "${INSTALL_PYDIR}") install(FILES ${python_files} py.typed DESTINATION "${INSTALL_PYDIR}") # Compile the Pybind11 module using setup.py (Called by generate_python_package.py run_poptorch_install_command( "python3 ${PROJECT_SOURCE_DIR}/scripts/generate_python_package.py install --include-dir ${CMAKE_INSTALL_PREFIX}/include --lib-dir ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} --output-dir ${CMAKE_INSTALL_PREFIX} --python-dir ${INSTALL_PYDIR}" "${PROJECT_SOURCE_DIR}" "poptorch_core.so module compilation") ================================================ FILE: python/__init__.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import abc import atexit import copy import copyreg import functools import importlib import os from typing import Any, Callable, Dict, Iterator, Optional, Union, Type, Sequence, Iterable import pickle import pkg_resources import torch # These are needed before the assert # pylint: disable=wrong-import-order from . import _logging from ._logging import logger # pylint: enable=wrong-import-order assert torch.__version__.startswith("@TORCH_VERSION@"), ( "This version" " of PopTorch only works with torch==@TORCH_VERSION@ but the version " f"installed is {torch.__version__}") # On POD the RDMA driver will hang if the parent process is forked after the # driver was initialised. # This would typically happen when a PyTorch Dataloader creates some workers. # To avoid the issue we need to explicitly enable safe fork. 
if "RDMAV_FORK_SAFE" not in os.environ: os.environ["RDMAV_FORK_SAFE"] = "1" try: import poptorch.poptorch_core as poptorch_core # type: ignore except ImportError as e: raise ImportError("Unable to import PopTorch, this can be caused by " "attempting to import PopTorch without an active Poplar " "SDK.\n The SDK can be enabled by running: " "`source /path/to/poplar-sdk/enable`") from e # pylint: disable=wrong-import-position from poptorch.poptorch_core import Error, RecoverableError, UnrecoverableError, importPoptorchMetadataFromFile from . import _dataloader from . import _impl from . import _poptorch_data from . import _utils from .enums import * from .ops import * from .options import * from ._impl import isRunningOnIpu, createPoptorchError from ._utils import accessAttributes, getIpuTensorId from ._poplar_executor import PoplarExecutor, registerPreCompileHook, registerPostCompileHook, _OverwriteContextManager from ._printing import * from . import optim from . import profiling # pylint: enable=wrong-import-position __version__ = "@VERSION@-@SNAPSHOT@" # Use package discovery to pass the true filesystem path of the installed python # package to C++. The path could later be used to pre-compile custom codelets # on demand. poptorch_core.setCustomCodeletsPath( pkg_resources.resource_filename("poptorch", "")) @atexit.register def poptorchAtExit(): poptorch_core.poptorchAtExit() def load(filename: str, edit_opts_fn: Optional[Callable[['poptorch.Options'], None]] = None ) -> 'poptorch.PoplarExecutor': """Load a PopTorch model from a file previously created using :py:meth:`~poptorch.PoplarExecutor.compileAndExport` :param edit_opts_fn: Function to edit the options before the model is restored. For example to attach to a specific IPU device. >>> model = poptorch.inferenceModel(model) >>> model.compileAndExport("my_model.poptorch") ... 
class _SubDataset:
    """For distributed execution split the dataset into serial blocks of tensors

    All the tensors used by process 0, followed by all the tensors
    used by process 1, and so on.

    [p0, p0, p0, ..., p1, p1, p1, ..., p2,p2, p2]

    If shuffling is used, then the indices in the parent (entire) dataset are
    randomised and ``swap_range`` will be called every time a new iterator is
    created in order to make sure all the tensors get used.
    """

    def __init__(self, dataset, opts, step, drop_last):
        """
        :param dataset: The entire (parent) dataset; must support ``len()``
            and indexing.
        :param opts: Options providing ``Distributed.numProcesses``,
            ``Distributed.processId`` and, optionally, ``random_seed``.
        :param step: Number of elements consumed per step by one process.
        :param drop_last: If True every process gets the same whole number of
            steps and the remaining elements are left over. If False the
            elements are spread as evenly as possible across the processes.
        """
        num_elts = len(dataset)
        num_procs = opts.Distributed.numProcesses
        proc_id = opts.Distributed.processId
        # Note: all the processes must have the same number of batches
        # or it will hang.
        if drop_last:
            per_proc = step * (num_elts // (step * num_procs))
            self._offset = proc_id * per_proc
            self._length = min(per_proc, num_elts - self._offset)
            # per_proc is 0 when the dataset doesn't contain a full step for
            # each process: the subset is then empty and there is no range
            # to swap (and no modulo by zero).
            self._leftovers = num_elts % per_proc if per_proc else 0
        else:
            # If the user explicitly requested to not drop the left over
            # elements then evenly distribute them across all the processes
            # and let the user take care of padding the tensors.
            quotient, remainder = divmod(num_elts, num_procs)
            per_proc = [
                quotient + (proc < remainder) for proc in range(num_procs)
            ]
            self._offset = sum(per_proc[:proc_id])
            self._length = per_proc[proc_id]
            self._leftovers = 0
        self._base_offset = self._offset
        self._dataset = dataset
        self._seed = opts.random_seed if opts.exists('random_seed') else None
        self._shuffling_generator_state = None
        self._shuffled_global_indices = None

    def shuffle_global_indices(self):
        """Shuffles the indices across the entire dataset."""
        generator = torch.Generator()
        if self._shuffling_generator_state is None:
            # First shuffle: start from the user provided seed.
            assert self._seed is not None, (
                "Seed must be set when shuffling so that all "
                "instances end up with the same shuffled global indices.")
            generator.manual_seed(self._seed)
        else:
            # Resume from where the previous shuffle stopped.
            generator.set_state(self._shuffling_generator_state)
        shuffled = torch.randperm(len(self._dataset), generator=generator)
        # Use shared memory so that the workers' indices
        # also get shuffled.
        if self._shuffled_global_indices is None:
            self._shuffled_global_indices = shuffled.share_memory_()
        else:
            self._shuffled_global_indices.copy_(shuffled)
        self._shuffling_generator_state = generator.get_state()

    def swap_range(self):
        """If there are leftovers in the randomly sampled dataset make sure
        they get included in the next iteration.

        For example if we've got:
        T = N * B + L
        T = total number of tensors
        N = number of full batches in T
        B = batch size
        L = Number of left over tensors

        First the dataset will return the tensors in [0, T-L]
        after ``swap_range`` was called the dataset will return tensors in
        [L, T]
        """
        if self._base_offset == self._offset:
            self._offset += self._leftovers
        else:
            self._offset = self._base_offset

    def __len__(self):
        """Number of elements assigned to this process."""
        return self._length

    def __getitem__(self, index):
        """Return the element at ``index`` within this process's range,
        going through the shuffled global indices if shuffling is used."""
        global_index = index + self._offset
        if self._shuffled_global_indices is not None:
            global_index = self._shuffled_global_indices[global_index]
        return self._dataset[global_index]


def _batch_sampler_len(
        batch_sampler: Union[torch.utils.data.Sampler[Sequence],
                             Iterable[Sequence]]) -> Optional[int]:
    """Return the number of batches produced by ``batch_sampler`` or None
    if its length is not available.

    The length is considered unavailable if the sampler has no ``__len__``,
    if ``__len__`` raises ``NotImplementedError`` or if ``len()`` raises
    ``TypeError`` (which is what happens when ``__len__`` returns
    ``NotImplemented`` or any other non-integer value).
    """
    if not hasattr(batch_sampler, "__len__"):
        return None
    try:
        return len(batch_sampler)
    except (NotImplementedError, TypeError):
        return None
:param batch_size: This is the batch size in the conventional sense of being the size that runs through an operation in the model at any given time. :param shuffle: Whether or not the dataset should be shuffled. :param num_workers: Number of worker processes to use to read the data. :param drop_last: If True and the number of elements in the dataset is not a multiple of the combined batch size then the incomplete batch at the end will be dropped. :param persistent_workers: Re-use workers between iterations if True. :param auto_distributed_partitioning: If True, partitions the dataset for distributed execution automatically. Otherwise, it is assumed that partitioning has been handled manually. :param mode: If `DataLoaderMode.Async`, uses an :py:class:`~poptorch.AsynchronousDataAccessor` to access the dataset. If `DataLoaderMode.Sync`, accesses the dataset synchronously. :param async_options: Options to pass to :py:class:`~poptorch.AsynchronousDataAccessor`. :param rebatched_worker_size: When using AsyncRebatched: batch size of the tensors loaded by the workers. Default to the combined batch size. If specified the ``rebatched_worker_size`` must be less than or equal to the combined batch size. :param batch_sampler: Defines the strategy to draw samples from the dataset. Returns a batch of indices at a time. Mutually exclusive with `batch_size`, `shuffle`. :param kwargs: Other options to pass to PyTorch's ``DataLoader`` constructor. 
""" self._is_user_batch_sampler_set = batch_sampler is not None if self._is_user_batch_sampler_set: if batch_size != 1 or shuffle: raise createPoptorchError( '`batch_sampler` option is mutually ' 'exclusive with batch_size, shuffle.') if options.Distributed.numProcesses > 1 and \ auto_distributed_partitioning: raise createPoptorchError( '`batch_sampler` option is mutually ' 'exclusive with auto_distributed_partitioning=True.') if hasattr(batch_sampler, "batch_size"): batch_size = batch_sampler.batch_size self.batch_sampler_drop_last = drop_last drop_last = None else: if shuffle is None: shuffle = False assert isinstance(options, Options) options._freeze() # pylint: disable=protected-access if persistent_workers is None: persistent_workers = num_workers > 0 self._combined_batch_size: Optional[int] self._num_batches_to_combine: Optional[int] if batch_size is None: self._combined_batch_size = None self._num_batches_to_combine = None else: input_group_count = options.replication_factor // \ options.input_group_size self._num_batches_to_combine = options.device_iterations * \ input_group_count * \ options.Training.gradient_accumulation self._combined_batch_size = batch_size * \ self._num_batches_to_combine self._options = options # Iterable datasets need to be handled differently: they don't have # __getitem__ and __len__ self._is_iterable = isinstance(dataset, torch.utils.data.IterableDataset) self._shuffle_map_style_data_in_distributed_env = False self._accessor = None if self._is_iterable: if auto_distributed_partitioning: assert options.Distributed.numProcesses == 1, ( "auto_distributed_partitioning not supported for" " IterableDataset") if num_workers > 1 and "worker_init_fn" not in kwargs: logger.warning( "IterableDataset used with num_workers=" "%d but no worker_init_fn specified: as a result" " the DataLoader will return %d times each element" " in the dataset (See torch.utils.data.IterableDataset's" " documentation for more information)", num_workers, 
num_workers) else: num_elts = len(dataset) if not drop_last: if self._is_user_batch_sampler_set: batch_sampler_len = _batch_sampler_len(batch_sampler) if batch_sampler_len is not None: num_incomplete_batches = batch_sampler_len % \ self._num_batches_to_combine if num_incomplete_batches != 0: logger.warning( "The number of batches generated by the batch" " sampler (%d) is not divisible by the number" " of batches elements processed per step (%d)" " and drop_last=False. The last tensor will" " have a batch size of %d. To avoid having to " " handle this special case switch to " " drop_last=True. Batch size = %d," " combined batch size = %d .", batch_sampler_len, self._num_batches_to_combine, num_incomplete_batches * batch_size, batch_size, self._combined_batch_size) else: logger.warning( "The `batch_sampler` __len__ method is not" " implemented and drop_last=False. The last tensor" " may be incomplete - batch size < %d. To avoid" " having to handle this special case switch to" " drop_last=True.", self._num_batches_to_combine) elif self._combined_batch_size is not None and \ num_elts % (self._combined_batch_size * options.Distributed.numProcesses) != 0: logger.warning( "The number of elements in the dataset " "(%d) is not divisible by the number of" " elements processed per step (%d)" " and drop_last=False. The last tensor will have " "a batch size of %d. 
To avoid having to handle " "this special case switch to drop_last=True", num_elts, self._combined_batch_size * options.Distributed.numProcesses, num_elts % (self._combined_batch_size * options.Distributed.numProcesses)) if options.Distributed.numProcesses > 1: if auto_distributed_partitioning: assert not shuffle or options.exists("random_seed"), ( "When using auto_distributed_partitioning you must set " "poptorch.Options.randomSeed() to ensure that tensors " "are in the same order in all processes.") assert self._combined_batch_size is not None, ( "batch_size=None not allowed when using " "auto_distributed_partitioning.") dataset = _SubDataset(dataset, options, self._combined_batch_size, drop_last) if shuffle: # In a distributed environment we handle the shuffling # ourselves (take a look at _SubDataset and __iter__) # so no need for parent class to shuffle within each of # the subsets again. self._shuffle_map_style_data_in_distributed_env = True shuffle = False if not self._is_iterable: dataset = profiling.Channel("dataset").instrument( dataset, "__getitem__") rebatched_size = None dataset_batch_size = 1 if self._is_user_batch_sampler_set \ else self._combined_batch_size if self._is_user_batch_sampler_set: real_drop_last = self.batch_sampler_drop_last else: real_drop_last = drop_last cbs_is_gt_one = self._combined_batch_size is not None and \ self._combined_batch_size > 1 async_mode_with_remainder = mode == DataLoaderMode.Async and \ not real_drop_last and cbs_is_gt_one if mode == DataLoaderMode.AsyncRebatched or async_mode_with_remainder: mode = DataLoaderMode.Async rebatched_size = self._combined_batch_size # When we rebatch: always let the worker process handle the # leftovers instead of the Dataloader. 
self.rebatched_drop_last = drop_last drop_last = False if rebatched_worker_size is not None: assert rebatched_worker_size <= self._combined_batch_size, ( f"The rebatched_worker_size ({rebatched_worker_size})" " must be <= to the combined batch size (" f"{self._combined_batch_size})") dataset_batch_size = rebatched_worker_size super().__init__(dataset, batch_size=dataset_batch_size, shuffle=shuffle, batch_sampler=batch_sampler, num_workers=num_workers, drop_last=drop_last, persistent_workers=persistent_workers, **kwargs) if mode == DataLoaderMode.Async: async_options = async_options or {} assert "rebatched_size" not in async_options, ( "You cannot " "use DataLoaderMode.AsyncRebatched and manually specify" " the rebatched_size in async_options") self._accessor = AsynchronousDataAccessor( self, **async_options, rebatched_size=rebatched_size) def __len__(self) -> int: # If we're rebatching in the AsynchronousDataAccessor we need to # adjust the dataset's length. if self._accessor is not None and self._accessor.rebatched_size: num_elts = len(self.dataset) dataset_len = num_elts // self._accessor.rebatched_size if not self.rebatched_drop_last and \ num_elts % self._accessor.rebatched_size: # Round up dataset_len += 1 else: dataset_len = super().__len__() return dataset_len @property def _profiling(self): return profiling.Channel("poptorch.DataLoader") @property def combinedBatchSize(self) -> Optional[int]: """Total number of elements consumed from the dataset for a single execution of the model.""" return self._combined_batch_size @property def options(self) -> 'poptorch.Options': """A reference to the options that were used to initialise this instance. """ return self._options def terminate(self) -> None: """If `mode==DataLoaderMode.Async`, kills the worker process in the underlying :py:class:`~poptorch.AsynchronousDataAccessor` manually, otherwise has no effect. 
class AsynchronousDataAccessor:
    """A data loader which launches the data loading in a separate worker
    process to allow for the data to be preprocessed asynchronously on CPU
    to minimise CPU/IPU transfer time.

    This works by loading the data into a ring buffer of shared memory.
    When the IPU needs another batch it uses the data ready in the ring
    buffer. The memory is shared so will be used in-place and won't be
    freed until the next batch is requested. Behind the scenes the worker
    will be filling the unready elements of the ring buffer.

    .. note:: When using a ``torch.utils.data.Dataset`` with
        ``rebatched_size`` the accessor will default to
        ``drop_last=True``, to change that behaviour wrap the dataset into a
        ``poptorch.DataLoader(..., drop_last=False)``.
    """

    def __init__(
            self,
            dataset: Union['torch.utils.data.Dataset', DataLoader],
            buffer_size: int = 3,
            miss_sleep_time_in_ms: float = 0.1,
            load_indefinitely: bool = True,
            early_preload: bool = False,
            sharing_strategy: 'poptorch.SharingStrategy' = SharingStrategy.
            ForkServer,
            rebatched_size: Optional[int] = None):
        """
        :param dataset: The dataset to pull data from, this can be any Python
            iterable.
        :param buffer_size: The size of the ring buffer.
        :param miss_sleep_time_in_ms: When the buffer is full how long should
            we sleep the worker before checking again.
        :param load_indefinitely: If True when we hit the end of the dataset
            we will just loop round again.
        :param early_preload: If True, start loading data in the ring buffer
            as soon as the worker is created.
            If False, wait for an iterator to be created before loading data.
        :param sharing_strategy: Method to use to pass the dataset object when
            the child process is created.

            * `SharedMemory` is fast but might be quite limited in size.
            * `FileSystem` will serialise the dataset to file and reload it
              which will be slower.
            * `Fork` new processes: no data sharing required but might cause
              problems if worker processes use threading.
            * `ForkServer` is similar to `Fork` but uses a server process to
              fork child processes. It is safe to use even if worker
              processes use threading.
        :param rebatched_size: If not None: return N batched tensors from
            the dataset per iteration. (The passed dataset must have a
            batch_size of 1).

        .. note :: If dataset is an iterable-type ``poptorch.DataLoader``
            configured with ``drop_last=False`` then ``rebatched_size``
            must be used.
        """
        # Set _worker to None in case something goes wrong and terminate is
        # called
        self._worker = None

        # Ensure the DataLoader doesn't already have an
        # AsynchronousDataAccessor
        if isinstance(dataset, DataLoader) and dataset._accessor is not None:
            raise createPoptorchError(
                "The DataLoader already uses an "
                "AsynchronousDataAccessor internally. Either use "
                "the existing one or set mode='poptorch.DataLoaderMode.Sync'"
                " in the DataLoader.")

        if isinstance(dataset, DataLoader) and \
            not dataset._is_user_batch_sampler_set and \
            not dataset.drop_last and \
            rebatched_size is None:
            # Otherwise we'll end up with one left over tensor per worker
            # to return to the main process and we don't currently
            # support that.
            assert dataset.combinedBatchSize is None or \
                dataset.combinedBatchSize == 1, (
                    "The 'drop_last=False' option from the DataLoader only "
                    "works if 'rebatched_size' is specified too.")

        if rebatched_size is not None:
            assert rebatched_size > 1, ("rebatched_size"
                                        " must be None or greater than 1")

        self._dataset = dataset
        # To avoid hangs when the application exits: implicitly call
        # terminate().
        atexit.register(self.terminate)
        self.rebatched_size = rebatched_size
        self._worker = _dataloader.AsynchronousWorker(
            buffer_size, miss_sleep_time_in_ms, dataset, load_indefinitely,
            early_preload, sharing_strategy, rebatched_size)

    def terminate(self) -> None:
        """
        An override function to kill the worker process manually.

        Safe to call more than once: the worker reference is dropped after
        the first call.
        """
        if self._worker is not None:
            self._worker.terminate()
            self._worker = None

    def __del__(self) -> None:
        self.terminate()

    def __len__(self) -> int:
        """Return the number of elements one pass over the accessor yields.

        When rebatching a dataset which is not the ``poptorch.DataLoader``
        owning this accessor, the length is recomputed from the number of
        samples and any incomplete last batch is not counted.
        """
        dataset_len = len(self._dataset)
        # If this AsynchronousDataAccessor is embedded in a DataLoader then
        # the dataset length has already been adjusted.
        embedded = getattr(self._dataset, "_accessor", None) is self
        if self.rebatched_size and not embedded:
            # A plain dataset has no ``batch_size`` attribute and a loader
            # may report ``None``: in both cases count one sample per
            # element, which is what rebatching requires of its input.
            batch_size = getattr(self._dataset, "batch_size", None) or 1
            num_elts = dataset_len * batch_size
            dataset_len = num_elts // self.rebatched_size
        return dataset_len

    def __iter__(self) -> 'poptorch.AsynchronousDataAccessor':
        assert self._worker is not None
        self._worker.resetIterator()
        return self

    def __next__(self) -> Any:
        # We return shared memory to the user so we can't tell the worker to
        # refill it until the next item is requested.
        assert self._worker is not None
        self._worker.releaseElement()
        while not self._worker.endOfFile():
            data = self._worker.acquireElementIfAvailable()
            if data is not None:
                return data
            # Nothing ready yet: make sure the worker didn't die before
            # polling again.
            self._worker.assertNoError()
        # EOF event
        raise StopIteration
def inferenceModel(model: Union['torch.nn.Module', 'poptorch.PoplarExecutor'],
                   options: Optional['poptorch.Options'] = None
                   ) -> 'poptorch.PoplarExecutor':
    """Create a PopTorch inference model, from a PyTorch model, to run on IPU
    hardware in inference mode.

    .. note:: PopTorch makes a shallow copy of the model. Changes to the
        parameters in the returned inference model affect the original model
        and vice versa. However, primitive variable types are not synced: for
        example calling ``model.eval()`` on the original model will not alter
        the model returned by this function. You may need to call
        ``model.eval()`` on your model before you call this function for
        correct behaviour.

    :param model: The PyTorch model to wrap. If a
        :py:class:`~poptorch.PoplarExecutor` is passed, the user model it
        holds is wrapped instead.
    :param options: The IPU specific options
    :returns: The :py:class:`~poptorch.PoplarExecutor` wrapper to use in place
        of ``model``.
    """
    # Unwrap an existing executor so that we always copy the user's model.
    user_model = (
        model._user_model  # pylint: disable=protected-access
        if isinstance(model, PoplarExecutor) else model)
    shallow_copy = copy.copy(user_model)

    return PoplarExecutor(model=shallow_copy,
                          options=options,
                          training=False,
                          poptorch_version=__version__)
def ipuHardwareVersion() -> int:
    """Report which IPU hardware version is available in the system.

    An assertion fails if no hardware is available.

    :returns: The IPU hardware version or -1 if unknown.
    """
    hw_version = poptorch_core.ipuHardwareVersion()
    # A version of 0 is how the core library reports "no hardware".
    no_hardware = hw_version == 0
    assert not no_hardware, "No IPU hardware available on this system"
    return hw_version
def registerGeometricCustomArgParsers():
    """Register the custom argument parsers provided by poptorch_geometric.

    Does nothing if ``poptorch_geometric`` (or its ``types`` submodule) is not
    installed, or if importing it fails with an ``ImportError``.
    """
    # Try registering pyg's custom arg parsers.
    # If anything goes wrong with import of pyg, then silently ignore it,
    # so that poptorch importers that don't need pyg do not needlessly fail.
    try:
        if importlib.util.find_spec("poptorch_geometric") is None:
            return
        if importlib.util.find_spec("poptorch_geometric.types") is None:
            return
        # Use the regular import machinery rather than the deprecated
        # ``Loader.load_module()``.
        pyg_types = importlib.import_module("poptorch_geometric.types")
        pyg_types.registerCustomArgParsers()
    except ImportError:
        pass
class ArgsParser:
    """Map the ``(args, kwargs)`` of a call onto the signature of a model's
    ``forward()`` (or of any callable), filling in default values."""

    class Args:
        """Positional arguments (with their names) and named arguments
        resulting from parsing one call."""

        def __init__(self):
            self._args = []
            self._arg_names = []
            self._kwargs = {}
            self.first_none = None

        @property
        def args(self):
            return self._args

        @property
        def arg_names(self):
            return self._arg_names

        @property
        def kwargs(self):
            return self._kwargs

        def appendArg(self, arg, name):
            """Append a positional argument and the name it maps to."""
            self._args.append(arg)
            self._arg_names.append(name)

        def setNamedArg(self, name, arg):
            self._kwargs[name] = arg

        def popArg(self):
            """Remove the last positional argument and its name."""
            self._args.pop()
            self._arg_names.pop()

        def clone(self):
            """Return a shallow copy: the containers are copied, the
            arguments they hold are shared."""
            # pylint: disable=protected-access
            clone = ArgsParser.Args()
            clone._args = copy.copy(self._args)
            clone._arg_names = copy.copy(self._arg_names)
            clone._kwargs = copy.copy(self._kwargs)
            clone.first_none = self.first_none
            return clone

        def _forEach(self, data, fn):
            """Rebuild ``data`` with ``fn`` applied to each of its tensors."""
            tensors = _utils.flattenTensorStructure(data)
            return _utils.reconstructTensorStructure(
                data, [fn(tensor) for tensor in tensors])

        def validateInputs(self, inputs):
            """Check ``inputs`` match the arguments stored in this object
            (the ones used to compile the model).

            :param inputs: The ``Args`` passed at runtime.
            :raises: A PopTorch error describing the first mismatch found
                (count, type, length, keys, dtype, shape or value).
            """
            end = (
                "\nThis error occurred because the inputs passed at runtime"
                " don't match the inputs used to compile the model.\n"
                "To recompile the model for the new inputs create a new "
                "inferenceModel / trainingModel wrapper or call destroy() on "
                "the current one and try again.")
            if len(inputs.args) != len(self.args):
                raise _impl.createPoptorchError(
                    "Number of positional arguments mismatch: expected "
                    f"{len(self.args)} arguments but got "
                    f"{len(inputs.args)}.{end}")

            def validate(name, compiled, input, are_named_args=False):
                ctype = type(compiled)
                itype = type(input)
                if ctype != itype:
                    raise _impl.createPoptorchError(
                        f"Type mismatch for {name}: expected "
                        f"{ctype} but got {itype}.{end}")
                if isinstance(compiled, tuple):
                    clen = len(compiled)
                    ilen = len(input)
                    if clen != ilen:
                        raise _impl.createPoptorchError(
                            f"Length mismatch for {name}: "
                            f"expected {clen} elements but got {ilen}.{end}")
                    for i, c in enumerate(compiled):
                        validate(name + f"[{i}]", c, input[i])
                elif isinstance(compiled, dict):
                    expected = set(compiled.keys())
                    provided = set(input.keys())
                    if expected != provided:
                        # Keys are converted to strings first: nested
                        # dictionaries may use non-string keys which
                        # str.join() would reject.
                        extra = provided - expected
                        details = []
                        if extra:
                            details.append("Unexpected arguments: " +
                                           ", ".join(sorted(map(str, extra))))
                        missing = expected - provided
                        if missing:
                            details.append(
                                "Missing arguments: " +
                                ", ".join(sorted(map(str, missing))))
                        raise _impl.createPoptorchError(
                            f"Keys mismatch for {name}: "
                            f"{'. '.join(details)}.{end}")
                    for k, v in compiled.items():
                        if are_named_args:
                            n = k
                        else:
                            n = f"{name}[{k}]"
                        validate(n, v, input[k])
                elif isinstance(compiled, torch.Tensor):
                    if compiled.dtype != input.dtype:
                        raise _impl.createPoptorchError(
                            "Data type "
                            f"mismatch for {name}: expected {compiled.dtype} "
                            f"but got {input.dtype}.{end}")
                    if compiled.shape != input.shape:
                        raise _impl.createPoptorchError(
                            "Shape "
                            f"mismatch for {name}: expected {compiled.shape} "
                            f"but got {input.shape}.{end}")
                else:
                    # If we've got a custom parser then we'll be able to
                    # extract the tensors and validate them as a list.
                    compiled_tensors = _utils.flattenTensorStructure(compiled)
                    if compiled_tensors:
                        input_tensors = _utils.flattenTensorStructure(input)
                        validate(name, tuple(compiled_tensors),
                                 tuple(input_tensors))
                    elif compiled != input:
                        # Other types are compiled in the graph (scalars, etc)
                        # and therefore should be an exact match to the value
                        # used to compile the model.
                        raise _impl.createPoptorchError(
                            f"Value mismatch for {name}: "
                            f"expected {compiled} but got {input}.{end}")

            for i, arg in enumerate(self.args):
                validate(self.arg_names[i], arg, inputs.args[i])
            validate("named arguments",
                     self.kwargs,
                     inputs.kwargs,
                     are_named_args=True)

        def forEachTensorMatchedAtLeastOnce(self, condition, doOnTrue=None):
            """Apply ``doOnTrue`` (if provided) to every tensor for which
            ``condition`` is true, replacing the tensor by the result.

            :returns: True if ``condition`` was true for at least one tensor.
            """
            matched = False

            def fn(t):
                nonlocal matched
                if condition(t):
                    matched = True
                    if doOnTrue is not None:
                        return doOnTrue(t)
                return t

            self.forEach(fn)
            return matched

        def forEach(self, fn):
            """Replace every tensor in the positional and named arguments by
            ``fn(tensor)``."""
            self._args = self._forEach(self._args, fn)
            self._kwargs = self._forEach(self._kwargs, fn)

        def asPackedFlatTuple(self, canonical_args=None):
            """Return the tensors of the positional then named arguments as
            one flat tuple."""
            # Remove all the non torch.tensor types and flatten
            # any data structure.
            cargs = None if canonical_args is None else canonical_args.args
            ckwargs = None if canonical_args is None else canonical_args.kwargs
            return tuple(
                _utils.flattenTensorStructure(self._args, cargs) +
                _utils.flattenTensorStructure(self._kwargs, ckwargs))

    def __init__(self, model: Any):
        """
        :param model: A ``torch.nn.Module`` (its ``forward()`` signature is
            used), an optimizer wrapper around one, or any callable.
        :raises TypeError: if ``model`` is none of the above.
        """
        # Combine args and kwargs:
        if isinstance(model, _impl.OptimizerWrapper):
            sig = inspect.signature(model.model.forward)
        elif isinstance(model, torch.nn.Module):
            sig = inspect.signature(model.forward)
        elif callable(model):
            try:
                sig = inspect.signature(model)
            except ValueError:
                # ValueError: no signature found for builtin ...
                # If the callable is a Cython function then its signature
                # might not be available (E.g torch.nn.functional.logsigmoid)
                sig = None
        else:
            raise TypeError("Expected a torch.nn.Module or a callable")

        if sig is None:
            # If we couldn't extract the function's signature: be flexible
            # and default to "*args, **kwargs"
            self._varnames = ["args", "kwargs"]
            self._var_kinds = [
                inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD
            ]
            self._defaults = {}
            self._has_variadic_arguments = True
        else:
            self._var_kinds = [p.kind for p in sig.parameters.values()]
            self._has_variadic_arguments = any(kind in [
                inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD
            ] for kind in self._var_kinds)
            self._varnames = list(sig.parameters.keys())
            # ``empty`` is a sentinel: compare by identity so that default
            # values overloading ``!=`` (for example arrays, which compare
            # element-wise) cannot break the filter.
            self._defaults = {
                name: p.default
                for name, p in sig.parameters.items()
                if p.default is not inspect.Parameter.empty
            }
        self._warned_not_contiguous_input = False

    def __call__(self,
                 args: Any,
                 kwargs: Dict[str, Any],
                 fast_path: bool = False) -> Args:
        """Checks the inputs are of a supported type. Inputs must be
        tensors or tuples/lists of tensors. Will convert list to tuples
        as we can't natively support lists in the JIT.

        :param args: The positional arguments of the call.
        :param kwargs: The named arguments of the call.
        :param fast_path: If True skip the check for dictionary inputs
            (used when executing rather than compiling).
        :returns: The arguments mapped onto the parsed signature, with
            non-contiguous tensors made contiguous.
        """
        in_tensors = ArgsParser.Args()
        assert self._has_variadic_arguments or len(args) + len(kwargs) <= len(
            self._varnames), ("Too many arguments provided: expected %s (%d) "
                              "but got %d") % (self._varnames,
                                               len(self._varnames),
                                               len(args) + len(kwargs))

        # Make sure all the arguments provided are allowed.
        if not self._has_variadic_arguments:
            for k in kwargs:
                assert k in self._varnames, (
                    f"{k} is not a valid parameter. "
                    f"Allowed values are {self._varnames}")

        variadic_pos_set = False
        for i, name in enumerate(self._varnames):
            is_variadic_pos = self._var_kinds[
                i] == inspect.Parameter.VAR_POSITIONAL
            is_variadic_keyword = self._var_kinds[
                i] == inspect.Parameter.VAR_KEYWORD
            if is_variadic_keyword:
                # A variadic keyword argument will consume all the remaining
                # kwargs
                used_names = self._varnames[:i]
                for k, v in kwargs.items():
                    if k not in used_names:
                        in_tensors.setNamedArg(k, v)
            elif i < len(args) or is_variadic_pos:
                # If it's a variadic parameter: consume all the remaining args
                # otherwise consume only one.
                if is_variadic_pos:
                    variadic_pos_set = True
                    a = args[i:]
                    # Clear args: all the arguments have been consumed
                    args = []
                else:
                    a = [args[i]]
                for idx, arg in enumerate(a):
                    if is_variadic_pos:
                        arg_name = f"*{name}[{idx}]"
                    else:
                        arg_name = name
                    # Non fast path for compilation, fast path for executing.
                    if not fast_path:
                        self._dictCheck(arg)
                    in_tensors.appendArg(arg, arg_name)
                assert name not in kwargs, ("Parameter %s was passed more "
                                            "than once") % name
            elif name in kwargs:
                # Non fast path for compilation, fast path for executing.
                if not fast_path:
                    self._dictCheck(kwargs[name])
                # Everything after a variadic positional argument must be
                # named
                if variadic_pos_set:
                    in_tensors.setNamedArg(name, kwargs[name])
                else:
                    in_tensors.appendArg(kwargs[name], name)
            else:
                if name not in self._defaults:
                    raise _impl.createPoptorchError("Mandatory parameter "
                                                    f"{name} missing")
                value = self._defaults[name]
                # Everything after a variadic positional argument must be
                # named
                if variadic_pos_set:
                    in_tensors.setNamedArg(name, value)
                else:
                    in_tensors.appendArg(value, name)

        if in_tensors.forEachTensorMatchedAtLeastOnce(
                condition=lambda t: not t.is_contiguous(),
                doOnTrue=lambda t: t.contiguous()):
            # Only warn once per parser.
            if not self._warned_not_contiguous_input:
                logger.warning("At least one input tensor is not contiguous: "
                               "non-contiguous tensors will be converted.")
                self._warned_not_contiguous_input = True

        return in_tensors

    def _dictCheck(self, data):
        """Warn (once per call) if ``data`` is, or contains within nested
        tuples / lists, a dictionary."""
        work = [data]
        while len(work) > 0:
            d = work.pop()
            if isinstance(d, (tuple, list)):
                work.extend(d)
            elif isinstance(d, dict):
                logger.warning("Dicts as inputs only have partial support, "
                               "they can be accessed using literal keys, but "
                               "full Python functionality is not enabled. "
                               "Consider changing dict inputs to tuple.")
                return
class AsynchronousWorker:
    """Interface for the host to create and manage a separate worker
    process to fetch elements from a dataset."""

    def __init__(self, buffer_size, miss_sleep_time_in_ms, dataset,
                 load_indefinitely, early_preload, sharing_strategy,
                 rebatched_size):
        """Create the worker process, start it and keep the handles it
        returns (command pipe, description of the data layout, end of file
        flag and data ring buffer).

        All the parameters are forwarded unchanged to
        ``_AsynchronousWorkerProcess``.
        """
        self._process = _AsynchronousWorkerProcess(
            buffer_size, miss_sleep_time_in_ms, dataset, load_indefinitely,
            early_preload, sharing_strategy, rebatched_size)
        # True once an element was acquired from the current iterator.
        self._was_used = False
        # True once the worker was told to start iterating.
        self._worker_started = False

        # Keep end of file events in a special buffer shared between worker
        # and device. This is due to the worker resetting automatically.
        (self._command_pipe, self._is_single_tensor, self._dict_keys,
         self._data_type_obj, self._eof,
         self._data_buffers) = self._process.start()

    def terminate(self):
        """Ask the worker process to shut down (if it is still alive) and
        wait for it to exit."""
        if self._process.isAlive():
            self._requestShutdown()

        self._process.join()

    def resetIterator(self):
        """Prepare the worker for a new iteration over the dataset.

        Does nothing if the current iterator was started but never used.
        Otherwise requests a reset from the worker when needed (see below)
        and tells it to start iterating.
        """
        if self._worker_started and not self._was_used:
            # The current iterator hasn't been used: nothing to do.
            return

        # Reset if:
        # - The EOF was reached and the worker is waiting to know if it
        #   should create a new iterator (load_indefinitely=False)
        # - We're partway through an iteration and we want to restart.
        #
        # Note: there is a race condition where the worker reaches EOF
        # after endOfFile() returned False.
        # The consequence is that reset will be called when it wasn't
        # actually needed. (i.e it won't break anything)
        if self._was_used and (not self.endOfFile() or
                               (self.endOfFile()
                                and not self._process.load_indefinitely)):
            # Request reset:
            self._command_pipe.send(_HostCommand.ResetIterator)
            # Release the element we might still hold before waiting.
            self.releaseElement()
            # Wait for the worker to acknowledge
            self._eof.waitForReset()
            self._data_buffers.reset()

        self._eof.clearFlag()
        # Let the worker know it can start loading
        self._command_pipe.send(_HostCommand.StartIterating)
        self._was_used = False
        self._worker_started = True

    def dataIsAvailable(self):
        """Return what the data ring buffer reports for ``isAvailable()``."""
        return self._data_buffers.isAvailable()

    def endOfFile(self):
        """Return True if the end of file flag is set for the current index
        of the data ring buffer."""
        return self._eof.isEofIndex(self._data_buffers.currentIndex())

    def acquireElementIfAvailable(self):
        """Lock and return the next element if one is ready.

        :returns: None if no data is available or the end of the dataset
            was reached. Otherwise the element in the form the dataset
            produced it: a single tensor, a dictionary, a custom object or
            a list of tensors.
        """
        assert not self._data_buffers.hasLock(), (
            "The current element "
            "must be released by calling releaseElement() before trying to "
            "acquire a new one")
        # Important: eof must be checked **after** dataIsAvailable.
        #
        # The worker does:
        # 1. setEOFflag()
        # 2. if load_indefinitely -> start prefetching the next iteration.
        # 3. mark data as available.
        #
        # So in the consumer / reader we need to check the flags in reverse
        # order otherwise there is a risk that eof will be False, then by
        # the time data is checked both eof and data are now True but
        # we'll miss eof and iterate over the ring buffer an extra time.
        if not self.dataIsAvailable() or self.endOfFile():
            return None

        # Number of valid entries if this is an incomplete last batch
        # (0 otherwise). Must be read before lock() as the flag is looked
        # up using the current index.
        left_over = self._eof.leftOver(self._data_buffers.currentIndex())
        # Pull and lock the ready buffer.
        data = self._data_buffers.lock()
        self._was_used = True
        if left_over > 0:
            # Only keep the valid entries along the first dimension.
            data = [d.narrow(0, 0, left_over) for d in data]
            # Update the EOF flag to the real index and clear the
            # left over value.
            self._eof.setFlag(self._data_buffers.currentIndex())

        # The worker process always sends us a tuple of tensors, however
        # the user data can actually be either:
        # - A list
        # - A single tensor
        # - A dictionary string -> Tensor
        # If it's a single tensor: return the first element of the list.
        if self._is_single_tensor:
            return data[0]
        if self._dict_keys:
            # If it's a dictionary: associate the data to the keys here.
            return dict(zip(self._dict_keys, data))
        if self._data_type_obj:
            # If it's a custom object type: reconstruct it using the ArgParser
            return reconstructTensorStructure(self._data_type_obj, data)
        # Else return the list as is.
        return data

    def assertNoError(self):
        """If the worker process has exited, assert its exit code is 0."""
        if not self._process.isAlive():
            assert self._process.exitCode() == 0, \
                "An error occurred in the data fetcher"

    def releaseElement(self):
        """Release the element previously acquired, if any."""
        # Set the previous iteration to false so it can be pulled in now
        # avoiding any data races.
        self._data_buffers.unlockIfLocked()

    def _requestShutdown(self):
        """Send the shutdown command to the worker process."""
        # Send the exit signal if the worker is still alive.
        try:
            self._command_pipe.send(_HostCommand.Shutdown)
        except BrokenPipeError:
            # The other end of the pipe is already closed: nothing to do.
            pass
def start(self):
    """Start the worker process and return what ``_start()`` returns.

    With the ``FileSystem`` sharing strategy the dataset is pickled to a
    temporary file and the worker is given the path of that file rather
    than the dataset object. The dataset attribute is restored once the
    worker has started (or failed to start).
    """
    # The dataset might not fit in shared memory: so use the file system
    # instead.
    if self._sharing_strategy != enums.SharingStrategy.FileSystem:
        return self._start()

    # Serialise the dataset to file and replace the dataset by the filename.
    original_dataset = self._dataset
    with tempfile.TemporaryDirectory() as tmp_dir:
        dataset_path = os.path.join(tmp_dir, "dataset.pkl")
        logger.debug("Serialising dataset to file: %s", dataset_path)
        with open(dataset_path, "wb") as pickle_stream:
            pickle.dump(original_dataset, pickle_stream, protocol=4)
        self._dataset = dataset_path
        try:
            worker_handles = self._start()
        finally:
            # Always give the real dataset back to this object.
            self._dataset = original_dataset
    return worker_handles
def _mainLoop(self, conn, command_pipe):
    """Entry point of the asynchronous worker process.

    SIGINT signals appear as KeyboardInterrupts and need to be handled
    as the ``atexit`` terminate hook is not guaranteed to be called
    before the signal is propagated to the worker processes.

    See Also:
        :meth:`_mainLoopNoInterrupt` for the implementation of worker
        event loop.

    :returns: Whatever :meth:`_mainLoopNoInterrupt` returns, or None if
        the loop was interrupted.
    """
    try:
        outcome = self._mainLoopNoInterrupt(conn, command_pipe)
    except KeyboardInterrupt:
        # Core interpreter libraries may already be unloaded
        # so don't do anything. More detail of caveats in the
        # pytorch note on [ Data Loader Multiprocessing Shutdown Logic ]:
        # https://github.com/pytorch/pytorch/blob/
        # aa7da7b09c4a3f972ede5fd8ad0cbc8c13498a00/
        # torch/utils/data/dataloader.py#L570
        return None
    return outcome
# We assume, that all the batches of objects generated by the # Dataloader will have the same sets of fields. Thanks to this, # we can prepare an empty copy of the object (keeping all its # attributes) and pass it to the _AsynchronousWorkerProcess caller, # so that the data type can get reconstructed. parser = getCustomParser(data) data_iter = iter(type(t)() for t in parser.yieldTensors(data)) data_type_obj = parser.reconstruct(data, data_iter) # Tell the host how many tensors we will be sending... data_length = sum(1 for i in data.__dict__.values() if isinstance(i, torch.Tensor)) # ...before the custom type data gets replaced with a generator. data = parser.yieldTensors(data) # Tell the host how many tensors we will be sending. if data_type_obj is None: data_length = len(data) buffers = _DataRingBufferWriter(self._buffer_size, data_length) # We communicate with the host via an array of sentinel values to say # if the data is ready as this has much better latency than queue or # lock approaches. conn.send(buffers.indices_mem) conn.send(data_length) conn.send(is_single_tensor) conn.send(dict_keys) conn.send(data_type_obj) eof = _EndOfFileFlag(eof_mem=None) conn.send(eof.eof_mem) # Send the tensors to the host. for index, tensor in enumerate(data): assert isinstance( tensor, torch.Tensor), (f"Tensor at index {index} is not a torch " f"tensor ({type(tensor)})." " AsynchronousDataAccessor expects data to " "be organised as a flat 1D container of " "tensors.") # Shared with parent process. 
tensor_size = [*tensor.size()] if self._rebatched_size: self._next_batch_idx = tensor_size[0] # Reshape with repeat if expand is not working in batch dimension if tensor_size[0] != self._rebatched_size: repeat_count = math.ceil(self._rebatched_size / tensor_size[0]) # Repeat then shrink to the right size tensor = tensor.repeat( repeat_count, *[1] * (len(tensor_size) - 1))[:self._rebatched_size] tensor_size[0] = self._rebatched_size memory = tensor.expand( self._buffer_size, *tensor_size).clone().contiguous().share_memory_() buffers.setBuffer(memory, index) # Send it to the host. conn.send(memory) # We've loaded the first element as part of the spin up process. if self._rebatched_size is None or \ self._next_batch_idx == self._rebatched_size: self._next_batch_idx = 0 # Tell the host this data is ready. buffers.markWriteComplete() host_handler = _HostCommandHandler(command_pipe) if self._early_preload: state = _WorkerState.Prefetching else: state = _WorkerState.Stopped rebatch_leftover = [] while not host_handler.shutdown_now: # Check for messages from the parent process: host_handler.checkMessages() if state == _WorkerState.Stopped: if host_handler.waitUntilStartIteration(): state = _WorkerState.Loading # else reset or shutdown received: fallthrough elif state == _WorkerState.Prefetching and \ host_handler.startIteratingPending(): # The host sent a request to start loading so transition from prefetching # to loading. state = _WorkerState.Loading if host_handler.shutdown_now: continue if host_handler.resetIteratorPending(): logger.debug("AsynchronousDataAccessor worker: reset command " "received. 
Creating a new iterator") buffers.reset() dataset_iterator = iter(self._dataset) self._next_batch_idx = 0 rebatch_leftover = [] # Let the host know everything has been reset eof.setResetFlag() # Wait for the host to ask for the new iteration to start if not host_handler.waitUntilStartIteration(): continue # received a shutdown command logger.debug("AsynchronousDataAccessor worker: the iterator " "has been reset") state = _WorkerState.Loading # We're now guaranteed to be either loading or prefetching eof_reached = False # Handle the left overs if any before asking for more data. if rebatch_leftover: data = rebatch_leftover rebatch_leftover = [] else: try: # Retrieve data from the dataset data = next(dataset_iterator) if isinstance(data, torch.Tensor): data = (data, ) elif isinstance(data, dict): # If the data is a dictionary: we expect the keys to # be strings and always the same, and the values to # all be tensors. As a result we only need to pass # the tensors as a tuple to the main process and # re-assemble the dictionary there. assert len(data) == len(dict_keys) data = tuple(data[k] for k in dict_keys) elif type(data) in custom_arg_parsers.keys(): parser = getCustomParser(data) data = parser.yieldTensors(data) except StopIteration: logger.debug( "AsynchronousDataAccessor worker: end of dataset" " reached") eof_reached = True # Wait for a writing slot to become available while not buffers.isAvailable( ) and not host_handler.priorityCommandWaiting(): # (Briefly) sleep the thread if we neither is True. 
if self._miss_sleep_time_in_ms > 0.0: time.sleep(self._miss_sleep_time_in_ms / 1000.0) host_handler.checkMessages() if host_handler.priorityCommandWaiting(): continue if eof_reached: # Note: it's important to have a writing slot before signalling # the end of the dataset or we might encounter the case where # the whole ring buffer is ready to read: # [ True, True, True] # At that point the read and write indices point at the same # index so if we set the EOF as the current write index then # the consumer will discard the whole ring buffer instead of # consuming the ready to read elements first. # Having a writing slot available ensures the read and write # indices never match (Even though the slot might not be used). # If we reach the EOF before the host asked us to start loading, # wait here to avoid potentially overwriting a pending # EOF event. if state == _WorkerState.Prefetching: if not host_handler.waitUntilStartIteration(): continue # reset or shutdown if self._rebatched_size and not rebatched_drop_last \ and self._next_batch_idx != 0: eof.setFlag(buffers.currentIndex(), self._next_batch_idx) # We're in the middle of a rebatch so the buffer # should already be available from previous # batch indices. assert buffers.isAvailable() buffers.markWriteComplete() else: eof.setFlag(buffers.currentIndex(), 0) # If we are not to load indefinitely we wait for the host # to explicitly ask for a new iterator to be created. if not self.load_indefinitely: logger.debug( "AsynchronousDataAccessor worker: end of dataset" " reached signalled to host: waiting for command from" " host") state = _WorkerState.Stopped continue # Go back to the wait for reset logger.debug("AsynchronousDataAccessor worker: end of dataset " "reached. Creating a new iterator") state = _WorkerState.Prefetching # We reset and keep the worker thread prefetching. 
dataset_iterator = iter(self._dataset) self._next_batch_idx = 0 logger.debug( "AsynchronousDataAccessor worker: new iterator ready") continue # We've got a writing slot if self._rebatched_size: assert not rebatch_leftover, ( "Rebatch data should be empty and" " ready to be used if needed") for index, tensor in enumerate(data): # Note _index_copy_ doesn't work for FP16, it causes # the following error: # RuntimeError: _th_index_copy_ not supported on CPUType # for Half" # # That's why we instead use a regular copy_ in_size = len(tensor) out_size = self._rebatched_size - self._next_batch_idx copy_size = min(in_size, out_size) if in_size > out_size: rebatch_leftover.append(tensor[copy_size:]) buffers.current[index][self._next_batch_idx:self. _next_batch_idx + copy_size].copy_( tensor[:copy_size]) self._next_batch_idx += copy_size else: # Copy the tensor into the preallocated shared memory. for index, tensor in enumerate(data): buffers.current[index].copy_(tensor) # If we're not rebatching: always notify the host an element is ready. # Otherwise only notify the host if the full batch is ready. if self._rebatched_size is None or \ self._next_batch_idx == self._rebatched_size: self._next_batch_idx = 0 # Tell the host this data is ready. buffers.markWriteComplete() logger.debug( "AsynchronousDataAccessor worker: ready to exit: checking parent" " is ready") # In the unlikely event the worker is done reading the dataset # before the parent is done setting the buffers up: wait here. 
host_handler.waitUntilSetupComplete() logger.debug("AsynchronousDataAccessor worker: clean exit") class _HostCommand(enum.IntEnum): SetupComplete = 0 Shutdown = 1 ResetIterator = 2 StartIterating = 3 class _WorkerState(enum.IntEnum): Stopped = 0 Prefetching = 1 Loading = 2 class _HostCommandHandler: def __init__(self, command_pipe): self.pipe = command_pipe self.setup_complete = False self.shutdown_now = False self._reset_iterator = False self._start_iterating = False def checkMessages(self, blocking=False, ignore_setup_complete=True): """ ignore_setup_complete: setup complete is usually just noise. (We only care about the setup being complete if we're trying trying to shutdown the worker process), so when asked to wait for a message, if the first one we receive is setup complete, usually we'll want to wait some more for the one we actually are interested in. """ # Check for messages from the parent process: if self.pipe.poll() or blocking: cmd = self.pipe.recv() # remove the data assert isinstance(cmd, _HostCommand) if cmd == _HostCommand.SetupComplete: logger.debug("SetupComplete command received") assert not self.setup_complete, ("More than one SetupComplete " "event received") self.setup_complete = True if ignore_setup_complete: self.checkMessages(blocking) elif cmd == _HostCommand.Shutdown: logger.debug("Shutdown command received") self.shutdown_now = True elif cmd == _HostCommand.ResetIterator: logger.debug("ResetIterator command received") self._reset_iterator = True elif cmd == _HostCommand.StartIterating: logger.debug("StartIterating command received") self._start_iterating = True else: raise _impl.createPoptorchError( f"Unknown command received {cmd}") def priorityCommandWaiting(self): return self.shutdown_now or self._reset_iterator def waitUntilSetupComplete(self): if not self.setup_complete: self.checkMessages(blocking=True, ignore_setup_complete=False) # Shutdown has been requested: there is no other valid command the host # can send at that point 
assert self.setup_complete def startIteratingPending(self): """Note: returns state and reset the value to False""" if self._start_iterating: self._start_iterating = False return True return False def resetIteratorPending(self): """Note: returns state and reset the value to False""" if self._reset_iterator: self._reset_iterator = False return True return False def waitUntilStartIteration(self): """Wait until a start iteration message is received. Return True if we successfully received a start iteration message. False if it was a reset or shutdown command. """ if self.priorityCommandWaiting(): return False if not self._start_iterating: self.checkMessages(blocking=True) return self.startIteratingPending() class _EndOfFileFlag: """ Share a small 2 values buffer with host to signal EOF and where in ring buffer the event occurred. First value: -1 means no event and the worker will keep loading until EOF is reached or the buffer is full. -2 means iterator reset complete. (Will be cleared by the worker once it's received the start iterating command from the host) Any other value: wait for an iterator to be created to start loading more data. Second value: when rebatching + drop_last=False: Indicate the batch size of the left over tensor 0: No left over 0 < N < rebatch_size: left-over batch size """ def __init__(self, eof_mem=None): if eof_mem is None: eof_mem = torch.tensor([-1, 0], dtype=torch.int).share_memory_() self.eof_mem = eof_mem def setResetFlag(self): """Called by the worker once the iterator has been reset""" self.eof_mem[0] = -2 def waitForReset(self): while self.eof_mem[0] != -2: pass def isEofIndex(self, index): return self.eof_mem[0] == index and self.eof_mem[1] == 0 def leftOver(self, index): """Batch size of the tensor at the end of file index. 0 either means it's not the end of the dataset yet or there is no left over batches, the end of file index is empty. (It will contain the first element from the next iteration if a new iterator is created). 
N means the element at the end of file index has a reduced batch of N. (The first element from the next iteration if a new iterator is created will be located at the next index). """ if self.eof_mem[0] == index: return self.eof_mem[1] return 0 def clearFlag(self): self.eof_mem[1] = 0 self.eof_mem[0] = -1 def setFlag(self, buffer_idx, last_batch_size=0): """If ``last_batch_size`` is 0 then ``buffer_idx`` is the index of the first buffer after the end of file. Otherwise the buffer at ``buffer_idx`` will contain a tensor of reduced batch size ``last_batch_size`` elements. (Only used when drop_last=False and rebatched_size > 0). """ # Important: eof_tensor[1] must be set before eof_tensor[0] # to avoid race conditions with the consumer. self.eof_mem[1] = last_batch_size self.eof_mem[0] = buffer_idx class _RingBufferIndex: """The index ring buffer is a ``buffer_size`` list of booleans keeping track which elements from the data ring buffers is ready to be written or ready to be read. * True: ready to write * False: ready to read. It is allocated using shared memory as it is shared between the worker process (producer) and the main process (consumer). The memory for the ring buffer will be allocated by the producer (Worker process) and initialised to all False (i.e all ready to be written). 
""" def __init__(self, buffer_size, indices_mem=None): if indices_mem is not None: self.buffers = indices_mem assert len(indices_mem) == buffer_size else: self.buffers = torch.tensor([False] * buffer_size, dtype=torch.bool).share_memory_() self.buffer_size = buffer_size self._index = 0 def increment(self): self._index += 1 if self._index >= self.buffer_size: self._index = 0 def reset(self): self.buffers.fill_(False) self._index = 0 def set(self, value): self.buffers[self._index] = value def value(self): return self.buffers[self._index] def __call__(self): return self._index class _IDataRingBuffer: def __init__(self, buffer_size, data_len, indices_mem=None): self._index = _RingBufferIndex(buffer_size, indices_mem) D = data_len B = buffer_size assert buffer_size == self._index.buffer_size # The structure of the allocated buffers is # buffers[D][B][tensor] where: # D = number of tensors in one tuple from the dataset # B = number of buffers in ring buffer. # # but we're going to iterate over B so we will store # the buffers as they get added to: # buffers[B][D][tensor] self._data = [[None] * D for _ in range(B)] def setBuffer(self, buffer, data_idx): """Add a new buffer to the ring expecting the tensor to be of the shape buffer[B][tensor] but we store tensors as: buffers[B][D] so we need to shuffle the data. 
""" assert len(buffer) == self._index.buffer_size assert data_idx < len(self._data[0]) for d in range(self._index.buffer_size): self._data[d][data_idx] = buffer[d] @property def current(self): """Return the current buffer""" return self._data[self._index()] @property def indices_mem(self): """Return the shared memory buffer used to store the indices""" return self._index.buffers def currentIndex(self): return self._index() def reset(self): """Reset the state of the ring buffer (All the buffers become available to write again)""" self._index.reset() class _DataRingBufferWriter(_IDataRingBuffer): """The writer's logic goes as follow: - Wait for the current slot to become available for writing - Fill the buffer - Mark the buffer as ready to be read and move to the next one. >>> while True: ... while not buffers.isAvailable(): ... time.sleep() ... buffers.current.copy(data) ... buffers.markWriteComplete() """ def markWriteComplete(self): """Mark the current buffer as ready to be read and move to the next buffer.""" self._index.set(True) self._index.increment() def isAvailable(self): """Return True if the current index is available for writing, or False if it contains a tensor which hasn't been read by the consumer process yet.""" return not bool(self._index.value()) class _DataRingBufferReader(_IDataRingBuffer): """The reader's logic goes as follow: - Wait for the current slot to become ready to read. - Mark the buffer as locked for reading and move to next buffer. - Read the locked buffer - Release the locked buffer Note: the consumer can check if the current buffer is available while the previous one is still locked however it cannot lock more than one buffer at any given time. 
""" def __init__(self, buffer_size, data_len, indices_mem=None): self._locked = None super().__init__(buffer_size, data_len, indices_mem) def isAvailable(self): """Return True if the current buffer is ready to be read.""" return bool(self._index.value()) def hasLock(self): """Return True if the ring buffer currently has a buffer locked for reading.""" return self._locked is not None def lock(self): assert self._locked is None self._locked = self.currentIndex() data = self.current self._index.increment() return data def unlockIfLocked(self): if self._locked is not None: self._index.buffers[self._locked] = False self._locked = None ================================================ FILE: python/_impl.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. from contextlib import contextmanager import copy import copyreg import fcntl import hashlib import itertools import os from functools import partial, wraps import weakref import torch # Do not import any poptorch.* here: it will break the poptorch module from ._logging import logger from . import poptorch_core from ._utils import isOnIpu, getIpuTensorId # A flag to tell the user if the current target is IPU. This is to allow # divergent IPU/CPU codepaths within one model. _is_ipu_context = False # A flag to tell if the dispatch mechanism is used to obtain # a graph. _dispatch_tracing = False # Some modules will still work even if the buffer address changes during tracing BUFFERS_CAN_CHANGE = ( torch.nn.BatchNorm1d, torch.nn.modules.batchnorm.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.modules.batchnorm.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.modules.batchnorm.BatchNorm3d, ) class NameScopeHook: """ Create a name scope for each operator present in the module. The operator name scope will be based on the names appearing in the named_modules function from torch.nn.Module.. 
""" def __init__(self, module: 'torch.nn.Module'): self.hooks = [] for name, m in module.named_modules(): if len(name) > 0: self.hooks.append( m.register_forward_pre_hook( partial(self._enter_fn, name=name))) self.hooks.append(m.register_forward_hook(self._exit_fn)) def _enter_fn(self, module, input, name): # pylint: disable=unused-argument torch.ops.poptorch.push_name_scope(name.split(".")[-1]) def _exit_fn(self, module, input, output): # pylint: disable=unused-argument torch.ops.poptorch.pop_name_scope() def remove(self): """ Remove all existing hooks related to creating a name scope for operators. """ for hook in self.hooks: hook.remove() def createPoptorchError(msg): type = "poptorch_py_error" error = poptorch_core.Error(f"'{type}': {msg}") error.type = type error.message = msg error.location = "" return error def isRunningOnIpu() -> bool: """ This function returns `True` when executing on IPU and `False` when executing the model outside IPU scope. This allows for separate code-paths to be marked in the model simply by using: >>> if poptorch.isRunningOnIpu(): >>> # IPU path >>> else: >>> # CPU path Note this will only apply to code during execution. During model creation it will always return `False`. :returns: True if running on IPU, otherwise False. """ global _is_ipu_context return _is_ipu_context def setIpuContext(val: bool): global _is_ipu_context _is_ipu_context = val def isDispatchTracing() -> bool: """ This function returns `True` when executing within the IPUScope. The flag is set when entering the scope and turned off when exiting. """ global _dispatch_tracing return _dispatch_tracing def setDispatchTracing(val: bool): global _dispatch_tracing _dispatch_tracing = val def internal_cast(tensor, dtype): if dtype in [torch.float, torch.float32]: return torch.ops.poptorch.internal_cast(tensor, "FLOAT") if dtype in [torch.half, torch.float16]: return torch.ops.poptorch.internal_cast(tensor, "FLOAT16") raise ValueError( 'Invalid poptorch.cast target type. 
def destroyDispatcherOnExit(func):
    """Function decorator which always destroys the dispatcher once the
    wrapped function has finished, whether it returned or raised."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on both the normal and the exceptional path.
            poptorch_core.destroyDispatcher()

    return wrapper
""" filename = None if opts.Distributed.numProcesses > 1: cache = opts._popart.options.get("cachePath", "") # pylint: disable=protected-access if not cache: logger.warning( "Use poptorch.Options.enableExecutableCaching() to avoid " "compiling the model once per process") else: os.makedirs(cache, exist_ok=True) assert os.access(cache, os.W_OK), (f"Cache folder {cache}" " is not writable") filename = os.path.join( cache, "%s.lock" % hashlib.md5(repr(model).encode("utf-8")).hexdigest()) # Not distributed mode or the cache is not enabled: do nothing. if not filename: yield False return delete_file = False try: with open(filename, "a+") as f: try: fcntl.flock(f, fcntl.LOCK_EX) # Add a character to the file f.write("0") logger.debug( "Executable cache file locked by process %s (pos %d/%d)", opts.Distributed.processId, f.tell(), opts.Distributed.numProcesses) delete_file = f.tell() == opts.Distributed.numProcesses # Only the first process should compile yield f.tell() == 1 finally: logger.debug("Process %s released the cache lock", opts.Distributed.processId) fcntl.flock(f, fcntl.LOCK_UN) finally: if delete_file: os.remove(filename) # A helper class that compares using pointer semantics rather than value # semantics (i.e. comparing using `is` rather than eq). This is needed because # Tensor comparison in torch returns a Tensor rather than an boolean class WeakPtr(weakref.ref): __hash__ = weakref.ref.__hash__ def __eq__(self, other): s = self() o = other() return self is other if s is None else s is o # Our own dictionary with weak keys that compares keys using pointer semantics # rather than value semantics (i.e. comparing using `is` rather than `eq`). We # use this rather than a weakref.WeakKeyDictionary because that uses equality on # values to compare items. 
class WeakKeyPtrDict:
    """Dictionary with weak keys which are compared using pointer semantics
    (``is``) rather than value semantics (``==``).

    Keys are stored wrapped in ``WeakPtr``; an entry is removed when its key
    object is garbage collected.

    Note: iterating over the dictionary is deliberately not supported since
    the cleanup callback could run while iterating.
    """

    def __init__(self, dict=None):
        self.data = {}

        # Only hold a weak reference to the dictionary in the callback so
        # the callback itself doesn't keep the dictionary alive.
        def cleanup(dead_key, selfref=weakref.ref(self)):
            owner = selfref()
            if owner is None:
                return
            del owner.data[dead_key]

        self._cleanup = cleanup
        self.update(dict)

    def __setitem__(self, key, value):
        weak_key = WeakPtr(key, self._cleanup)
        self.data[weak_key] = value

    def __delitem__(self, key):
        weak_key = WeakPtr(key)
        del self.data[weak_key]

    def __getitem__(self, key):
        weak_key = WeakPtr(key)
        return self.data[weak_key]

    def get(self, key, default=None):
        """Return the value stored for ``key`` or ``default`` if absent."""
        weak_key = WeakPtr(key)
        return self.data.get(weak_key, default)

    def __contains__(self, key):
        weak_key = WeakPtr(key)
        return weak_key in self.data

    def update(self, dict=None):
        """Insert every ``(key, value)`` pair of ``dict`` (no-op if None)."""
        if dict is None:
            return
        for key, value in dict.items():
            self[key] = value
def traceMethod(label):
    """Return a method decorator which runs the decorated method inside
    ``self._profiling.tracepoint(label)``.

    The decorated method's arguments and return value are passed through
    unchanged.
    """

    def decorator(func):
        def traced(self, *args, **kwargs):
            tracepoint = self._profiling.tracepoint(label)  # pylint: disable=protected-access
            with tracepoint:
                return func(self, *args, **kwargs)

        # Keep the name / docstring of the decorated method.
        return wraps(func)(traced)

    return decorator
def errorOnBufferOrParameterAddressChanges(old_addresses, new_addresses):
    """Raise a PopTorch error if a buffer or parameter was removed or
    reassigned between two address snapshots.

    Both arguments are indexable as ``[0]`` for the buffers and ``[1]`` for
    the parameters, each entry being a ``{name: address}`` dictionary.
    Names which only exist in ``new_addresses`` are not reported.
    """
    # Buffers come first, then parameters
    kinds = ["Buffer", "Parameter"]
    for position, previous in enumerate(old_addresses):
        for name, old_address in previous.items():
            if name not in new_addresses[position]:
                removed_msg = (
                    f"{kinds[position]} {name} is removed from the model when"
                    " calling the forward method.")
                raise createPoptorchError(removed_msg)

            if old_address == new_addresses[position][name]:
                continue

            reassigned_msg = (
                f"{kinds[position]} {name} is reassigned within the model"
                " when calling the forward method. This is not supported. "
                f"Consider using self.{name}.copy_(src) to copy data from "
                "a source tensor, where src is the name of the source "
                "tensor.")
            raise createPoptorchError(reassigned_msg)
def _excepthook(*args):
    """Report an uncaught exception through the PopTorch logger.

    ``args`` is the ``(type, value, traceback)`` triple Python passes to
    ``sys.excepthook``. If the exception is a
    ``subprocess.CalledProcessError`` the captured stderr (or stdout when
    there is no stderr) of the subprocess is appended to the report.
    """
    e = traceback.format_exception(*args)
    extra_info = ""
    # If the exception was raised by a subprocess print its
    # stderr / stdout if available.
    if isinstance(args[1], subprocess.CalledProcessError):
        output = args[1].stderr or args[1].stdout
        # The output is None when nothing was captured and is already a str
        # when the subprocess ran in text mode: only decode actual bytes.
        if isinstance(output, bytes):
            # Never let a decoding error mask the exception being reported.
            output = output.decode("utf-8", errors="replace")
        if output:
            extra_info = "\n" + output
    if any("[closed]" in repr(h) for h in logger.handlers):
        # In some cases pytest has already closed the logger so use stderr
        # as a fallback.
        # print() doesn't apply %-formatting itself: build the message first.
        print("%s\n%s%s" % (e[-1], "".join(e), extra_info), file=sys.stderr)
    else:
        logger.critical("%s\n%s%s", e[-1], "".join(e), extra_info)
# pylint: disable=too-many-statements
def convertOptimizerToDict(optimizer, attr_tracker, options, is_compiled):
    """Convert a PyTorch / PopTorch optimizer into the dictionary sent to
    the backend.

    The returned dictionary is structured like:
    {
        "optimizer_type": type,
        "opt_attrs_0": value,
        ...
        "defaults": {
            "group_vars_0": (value, is_const),
            ...
            "opt_vars_0": (value, is_const),
            ...
        },
        "groups": [
            {
                "group_vars_0": (value, is_const),
                ...
            },
            ...
        ]
    }

    :param optimizer: The optimizer to convert: its ``param_groups``,
        ``defaults`` and instance attributes are read.
    :param attr_tracker: Tracker used to record (first call) and then
        check (after ``enableChecks()``) the attribute names found in the
        optimizer.
    :param options: The PopTorch options. If the optimizer has
        ``accum_type == torch.float16`` and does not use the combined
        accumulator, the mean reduction strategy is switched to
        ``Running`` (only possible while ``is_compiled`` is False).
    :param is_compiled: True if the executable has already been compiled,
        in which case the options cannot be changed any more.
    :returns: The optimizer dictionary described above.
    :raises ValueError: if an attribute has an invalid value (amsgrad
        enabled, an accumulator type which is neither float16 nor float32,
        a NaN ``max_grad_norm``, a RMSProp-only attribute used with another
        optimizer) or if the reduction strategy would need to change after
        compilation.
    """
    optimizer_type = _toPoptorchOptimizer(optimizer)
    attr_tracker.setType(optimizer_type)
    assert optimizer_type is not None, \
           "Unsupported optimizer type. Types supported %s" % \
           ', '.join(str(t) for t in _OptimizerType)
    opt_class = _toPoptorchClass(optimizer_type)

    num_groups = len(optimizer.param_groups)
    variable_attrs = getattr(optimizer, "variable_attrs", None)

    def assertAmsgradDisabled(params):
        if params["amsgrad"]:
            raise ValueError("Only non-amsgrad "
                             "Adam/AdamW optimizers are supported.")
        return {}

    def isFloat16(dtype, name):
        if dtype not in [torch.float16, torch.float32]:
            # Both halves of the message must be f-strings, otherwise the
            # offending type is never interpolated.
            raise ValueError(f"{name} must be set to either torch.float16"
                             f" or torch.float32 not {dtype}")
        return dtype == torch.float16

    def assertRMSProp(value, name):
        if optimizer_type not in (_OptimizerType.RMSPROP,
                                  _OptimizerType.RMSPROP_CENTERED):
            raise ValueError(
                f"{name} is only available with RMSProp optimizers.")
        return value

    def ignore(_params):
        return {}

    def isAlwaysConst(_value):
        return True

    def isNeverConst(_value):
        return False

    def isNotNaN(value, name):
        # NaN is the only value which is not equal to itself: comparing
        # against float("nan") with == would always be False.
        if value != value:  # pylint: disable=comparison-with-itself
            raise ValueError(f"{name} must not be NaN")
        return value

    # Separate attributes which can be set per group (And therefore are stored
    # in `defaults` and `param_groups`) and the ones which are global and just
    # stored as attributes of the optimizer.

    # Register all the attribute readers
    attr_readers = {
        "amsgrad": assertAmsgradDisabled,
        "bias_correction": ignore,
        "centered": ignore,
        "use_combined_accum": ignore
    }
    # Optimizer attributes: global, cannot change over time.
    #     source: opt.name
    #     format: {name: value}
    _AttrReader(attr_readers, "accum_type", _OptimizerGetter(torch.float32),
                isFloat16)
    _AttrReader(attr_readers, "velocity_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "first_order_momentum_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "second_order_momentum_accum_type",
                _OptimizerGetter(torch.float32), isFloat16)
    _AttrReader(attr_readers, "use_tf_variant", _OptimizerGetter(False),
                assertRMSProp)
    _AttrReader(attr_readers, "max_grad_norm", _OptimizerGetter(float("Inf")),
                isNotNaN)
    # Optimizer variables: global, can change over time.
    #     source: opt.name
    #     format: {name: (value, is_const)}

    # Set MeanReductionStrategy based on accum_type
    #   float32: Post (default)
    #   float16: Running
    if hasattr(optimizer,
               "accum_type") and optimizer.accum_type == torch.float16:
        # Only Post MeanReductionStrategy is supported for combined_accum variant
        if not hasattr(
                optimizer,
                "use_combined_accum") or not optimizer.use_combined_accum:
            if not is_compiled:
                # If the executable hasn't been compiled yet then it's ok to change
                # the reduction strategy.
                options._unfreeze()  # pylint: disable=protected-access
                options.Training.setMeanAccumulationAndReplicationReductionStrategy(  # pylint: disable=line-too-long
                    enums.MeanReductionStrategy.Running)
                options._freeze()  # pylint: disable=protected-access
            elif options.Training.meanAccumulationAndReplicationReductionStrategy != enums.MeanReductionStrategy.Running:  # pylint: disable=line-too-long
                raise ValueError(
                    "Invalid optimizer: the new optimizer would "
                    "require changing options.Training."
                    "meanAccumulationAndReplicationReductionStrategy to "
                    "poptorch.MeanReductionStrategy.Running but the "
                    "executable is already compiled.")

    # pylint: disable=protected-access
    auto_loss_scaling = options._Popart.options.get(
        "automaticLossScalingSettings.enabled", False)
    if variable_attrs and auto_loss_scaling:
        # Automatic loss scaling requires loss scaling to be variable
        variable_attrs.markAsVariable("loss_scaling")
    _AttrReader(
        attr_readers, "loss_scaling", _OptimizerGetter(1.0),
        _ValueConstPairFormatter(
            variable_attrs, lambda v: v == 1.0 and not auto_loss_scaling))

    # Group variables: per group, can change over time.
    #     source: opt.param_groups[i][name] / opt.defaults[name]
    #     format: {name: (value, is_const)}
    _AttrReader(attr_readers,
                "lr",
                _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isNeverConst),
                new_name="learningRate")

    weight_decay_const_value = 0.0
    # In PyTorch AdamW has a different default value from Adam
    if optimizer_type == _OptimizerType.ADAMW:
        weight_decay_const_value = 1e-2

    _AttrReader(
        attr_readers, "weight_decay", _GroupGetter(),
        _ValueConstPairFormatter(variable_attrs,
                                 _IsEqualTo(weight_decay_const_value)))
    _AttrReader(attr_readers, "momentum", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(0.0)))
    _AttrReader(attr_readers, "velocity_scaling", _GroupGetter(1.0),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(1.0)))
    _AttrReader(attr_readers, "dampening", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(0.0)))
    _AttrReader(attr_readers, "eps", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(1e-08)))
    _AttrReader(attr_readers, "max_weight_norm", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, _IsEqualTo(65500.0)))
    _AttrReader(attr_readers, "alpha", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isAlwaysConst))
    _AttrReader(attr_readers, "nesterov", _GroupGetter(),
                _ValueConstPairFormatter(variable_attrs, isAlwaysConst))
    _BetaReader(attr_readers, variable_attrs)

    # Split the optimizer's attributes in one of the three categories:
    # - Group variables
    # - Optimizer variables
    # - Optimizer attributes
    group_vars = opt_class._group_vars  # pylint: disable=protected-access
    all_attrs = [
        attr for attr in opt_class._child_only if attr not in group_vars  # pylint: disable=protected-access
    ]
    opt_attrs = [
        attr for attr in all_attrs if attr not in opt_class._child_vars  # pylint: disable=protected-access
    ]
    opt_vars = [
        attr for attr in opt_class._child_only  # pylint: disable=protected-access
        if attr in opt_class._child_vars  # pylint: disable=protected-access
    ]

    def getOptimizerAttrNames(opt):
        # Remove attributes belonging to the upstream Optimizer
        exceptions = ["defaults", "state", "param_groups", "variable_attrs"]
        return [k for k in opt.__dict__.keys() if k not in exceptions]

    def getGroupAttrNames(group):
        # Remove attributes belonging to the upstream Optimizer
        exceptions = ["params"]
        return [k for k in group.keys() if k not in exceptions]

    opts = {"optimizer_type": optimizer_type}
    for attr in opt_attrs:
        opts.update(attr_readers[attr](optimizer))
    defaults = {}
    for attr in group_vars:
        defaults.update(attr_readers[attr](optimizer.defaults))
    attr_tracker.checkDefaultAttributes(list(optimizer.defaults.keys()))
    for attr in opt_vars:
        defaults.update(attr_readers[attr](optimizer))
    attr_tracker.checkOptimizerAttributes(getOptimizerAttrNames(optimizer))
    for i, g in enumerate(optimizer.param_groups):
        attr_tracker.checkGroupAttributes(getGroupAttrNames(g), i)

    opts["defaults"] = defaults

    # Create num_groups dictionaries
    opts["groups"] = []
    for index in range(0, num_groups):
        group = {}
        params = optimizer.param_groups[index]
        for attr in group_vars:
            group.update(attr_readers[attr](params))
        opts["groups"].append(group)

    logger.debug("Python optimizer %s", opts)
    # From now on print a message when encountering unknown attributes
    attr_tracker.enableChecks()
    return opts
def _toCamelCase(string):
    """Turn a snake case name (PyTorch) into a camel case one (PopART).

    The part before the first underscore is kept as it is; every following
    part goes through ``str.capitalize()`` before being appended.
    """
    head, *tail = string.split("_")
    pieces = [head]
    for word in tail:
        pieces.append(word.capitalize())
    return "".join(pieces)
class _ValueConstPairFormatter:
    """Functor turning a value into a ``(value, is_const)`` pair, where
    ``is_const`` is a boolean.

    When a (truthy) ``variable_attrs`` object was given, the const-ness of
    the attribute comes from ``variable_attrs.isConstant(name)``. In all
    the other cases ``const_evaluator(value)`` decides.

    The value is always passed through ``value_validator(value, name)``
    first (``_assertIsNumber`` unless another validator was provided).
    """

    def __init__(self, variable_attrs, const_evaluator, value_validator=None):
        if variable_attrs is not None:
            assert isinstance(variable_attrs, optim.VariableAttributes)
        self.variable_attrs = variable_attrs
        self.const_evaluator = const_evaluator
        self.value_validator = (_assertIsNumber if value_validator is None
                                else value_validator)

    def __call__(self, value, name):
        self.value_validator(value, name)
        if not self.variable_attrs:
            # No variable attributes tracker: rely on the evaluator.
            return (value, self.const_evaluator(value))
        return (value, self.variable_attrs.isConstant(name))
def parseAndSetOptions(options, filepath):
    """Apply the settings listed in a config file to ``options``.

    Every line of the file which is neither empty nor a comment (starting
    with ``#``) is prefixed with ``options.`` and the resulting statements
    are executed with ``poptorch`` and ``options`` in scope.

    :param options: The options object the statements are applied to.
    :param filepath: Path of the config file to read.
    :raises poptorch.ConfigFileError: if the generated code contains a
        syntax error. The reported line number is the one in the config
        file.
    """
    cmds = []
    prefix = "options."
    with open(filepath) as f:
        filename = os.path.basename(f.name)
        for line in f:
            # Remove whitespace
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                # Empty lines and comments are not executed, but keep a
                # blank placeholder so that the line numbers of the
                # generated code match the ones of the config file.
                cmds.append("")
                continue
            cmds.append(prefix + stripped)

    code = "\n".join(cmds)
    try:
        # NOTE(security): the content of the config file is executed as
        # Python code: only load config files from trusted sources.
        # pylint: disable=exec-used
        exec(code, {}, {"poptorch": poptorch, "options": options})
    except SyntaxError as err:
        err_class = err.__class__.__name__
        detail = err.args[0]
        lineno = err.lineno
        line = err.text
        # pylint: disable=no-member
        raise poptorch.ConfigFileError("{} at line {} of {}: {}\n> {}".format(
            err_class, lineno, filename, detail, line)) from err
class ProgressBar:
    """Callable progress reporter displaying a tqdm bar labelled
    "Graph compilation".

    Each call receives the current ``progress`` and the ``total``: the bar
    is created on the first call, advanced by the difference with the
    previously reported progress, and closed once ``progress == total``.
    ``compilation_time`` holds the time elapsed since the bar was created.
    """

    def __init__(self):
        self._bar = None
        self._last = 0
        self._start_time = None
        self.compilation_time = None

    def __call__(self, progress: int, total: int):
        if self._bar is None:
            self._start_time = datetime.datetime.now()
            # {rate_fmt}{postfix} are dropped from the default format as
            # they don't really make sense for a compilation process
            #
            # Note: this is *not* a f-string
            self._bar = tqdm.tqdm(
                desc="Graph compilation",
                total=total,
                bar_format=("{l_bar}{bar}| {n_fmt}/{total_fmt} "
                            "[{elapsed}<{remaining}]"))

        delta = progress - self._last
        self._bar.update(delta)
        self._last = progress
        self.compilation_time = datetime.datetime.now() - self._start_time

        if progress != total:
            return

        # Compilation complete: close the bar and record the final time.
        self._bar.close()
        self._bar = None
        self.compilation_time = datetime.datetime.now() - self._start_time
Expected %s" % ( type(value), option, type(self._values[option])) self._values[option] = value def createOrSet(self, **kwargs): self.checkIsFrozen() for option, value in kwargs.items(): if option in self._values: self.set(**{option: value}) else: self._values[option] = value def exists(self, option): return option in self._values def deleteIfExists(self, option): if self.exists(option): del self._values[option] def _hasattr(self, option): if option == "__class__": return True if option.startswith("_"): return option in self.__getstate__().keys() return self.exists(option) # pylint: disable=protected-access def _changeFreezeState(self, new_state): self._is_frozen = new_state for _, value in self.__dict__.items(): if isinstance(value, OptionsDict): if value._hasattr('_is_frozen'): value._is_frozen = new_state else: if hasattr(value, '_is_frozen'): value._is_frozen = new_state def _freeze(self): self._changeFreezeState(True) def _unfreeze(self): self._changeFreezeState(False) def checkIsFrozen(self, option=None): # Skip check during object initialization. if self._hasattr('_is_frozen'): if option != '_is_frozen' and self._is_frozen: raise AttributeError("Can't modify frozen Options") def __deepcopy__(self, memory): opts_class = self.__class__ copied_options = opts_class.__new__(opts_class) memory[id(self)] = copied_options for key, val in self.__dict__.items(): if key == '_is_frozen': val = False setattr(copied_options, key, copy.deepcopy(val, memory)) return copied_options def __getstate__(self): return self.__dict__ def __setstate__(self, state): self.__dict__.update(state) def __setattr__(self, option, value): # Private attributes are allowed, but should be set in the __init__ before _values # public ones must be declared in default_values. self.checkIsFrozen(option) if option.startswith("_"): # Option cannot be defined after _values definition. if self._hasattr('_values') and not self._hasattr(option): raise AttributeError( f"Invalid private attribute {option}. 
class IStageManager(abc.ABC):
    """Base class of the stage managers: maps block ids to stages and
    emits the ``begin_ipu_block`` op when a new block starts.

    Subclasses must implement ``getStage()``.
    """

    def __init__(self):
        self._next_auto_id = 0
        self._current_ipu = None
        # We expect Torch to trace the graph 3 times, so to avoid printing
        # the same messages several times we store all the messages and
        # print the first third of them at the end.
        self._debug_messages = []

    def clearDebug(self):
        """Discard all the stored debug messages."""
        self._debug_messages = []

    def _debug(self, *args):
        """Store a debug message (``logger.debug`` style arguments) if the
        DEBUG level is enabled."""
        if logger.isEnabledFor(logging.DEBUG):
            self._debug_messages.append(args)

    def printDebug(self):
        """Log the stored debug messages.

        If the messages look like three identical traces only the first
        third is printed, otherwise all of them are.
        """
        n = len(self._debug_messages)
        # We assume the graph was traced 3 times if:
        # - Number of messages can be divided by 3
        # - The first message is identical to the n/3th and 2n/3th ones.
        is_triple_trace = n > 0 and n % 3 == 0\
                and self._debug_messages[0] == self._debug_messages[n//3] \
                == self._debug_messages[2*n//3]
        if is_triple_trace:
            for i in range(n // 3):
                logger.debug(*self._debug_messages[i])
        else:
            # Not sure what happened: in doubt print everything
            for m in self._debug_messages:
                # Each message is a tuple (format, arg0, ...): it must be
                # unpacked, otherwise the tuple itself gets logged.
                logger.debug(*m)

    def nextAutoId(self):
        """Return the next automatically generated block id as a string."""
        auto_id = self._next_auto_id
        self._next_auto_id += 1
        return str(auto_id)

    @abc.abstractmethod
    def getStage(self, block_id):
        """Return the stage corresponding to the given block_id.
        """

    def beginStage(self, user_id, ipu_id_from_block):
        """Start a new block.

        :param user_id: Id of the block. If falsy an id is automatically
            generated.
        :param ipu_id_from_block: IPU requested by the block, used if the
            stage doesn't specify one. If neither specifies an IPU then
            the stage id is used.
        """
        user_id = user_id or self.nextAutoId()
        self._current_ipu = ipu_id_from_block
        stage = self.getStage(user_id)
        # If the user specified an ipu_id in the option use that one
        ipu = stage._ipu if stage._ipu is not None else ipu_id_from_block  # pylint: disable=protected-access
        if ipu is None:
            self._debug(
                "No IPU specified for block %s: default to stage_id %d",
                user_id, stage._stage_id)  # pylint: disable=protected-access
            ipu = stage._stage_id  # pylint: disable=protected-access
        self._debug("Starting block id=%s stage=%d phase=%d ipu=%d", user_id,
                    stage._stage_id, stage._phase_id, ipu)  # pylint: disable=protected-access
        _begin_ipu_block(stage._stage_id, stage._phase_id, ipu)  # pylint: disable=protected-access

    def resetAutoId(self):
        """Restart the automatic block ids from 0."""
        self._next_auto_id = 0
class _SetDefaultDeviceType:
    """Substitution manager patching the ``torch`` factory functions so
    that ``device="ipu"`` is used when the caller did not pass a device.

    ``replace()`` installs the wrappers and disables the default argument
    validation of ``torch.distributions``; ``restore()`` puts the saved
    functions and validation setting back.
    """

    def __init__(self):
        self.overrides = {}
        self.saved_distribution_validate_args = None

    def replace(self):
        def force_ipu(kwargs, fn_name):
            logger.warning("No device set in torch.%s(): forcing to IPU",
                           fn_name)
            kwargs["device"] = "ipu"

        def create_wrapper(f):
            @functools.wraps(f)
            def _wrapper(*args, **kwargs):
                if "device" not in kwargs:
                    force_ipu(kwargs, f.__name__)
                return f(*args, **kwargs)

            return _wrapper

        def create_non_tensor_wrapper(f):
            @functools.wraps(f)
            def _wrapper(*args, **kwargs):
                # Only force the device if none of the arguments is a
                # tensor and no device was explicitly requested.
                has_tensor = any(
                    isinstance(a, torch.Tensor)
                    for a in itertools.chain(args, kwargs.values()))
                if not has_tensor and "device" not in kwargs:
                    force_ipu(kwargs, f.__name__)
                return f(*args, **kwargs)

            return _wrapper

        def install(names, make_wrapper):
            for name in names:
                real = getattr(torch, name)
                self.overrides[name] = real
                setattr(torch, name, make_wrapper(real))

        # All the ops with FACTORY_PARAMS in /tools/pyi/gen_pyi.py
        install(("arange", "empty", "full", "full_like", "linspace",
                 "logspace", "ones", "rand", "randint", "randn", "randperm",
                 "range", "tensor", "zeros", "zeros_like"), create_wrapper)

        # overloaded ops that take a device for some overloads
        install(["normal"], create_non_tensor_wrapper)

        # Arguments validation forces the tensors to be compared onto the IPU
        # then the result is sent back to the CPU.
        # For example:
        # >>> if self._validate_args:
        # >>>    assert torch.lt(self.low, self.high).all()
        # pylint: disable=protected-access
        distribution_cls = torch.distributions.Distribution
        self.saved_distribution_validate_args = \
            distribution_cls._validate_args
        distribution_cls.set_default_validate_args(False)

    def restore(self):
        # Put the real Torch functions back
        for name in self.overrides:
            setattr(torch, name, self.overrides[name])
        torch.distributions.Distribution.set_default_validate_args(
            self.saved_distribution_validate_args)
options._freeze() options = options.clone() else: options = Options() # NB model is the one which gets called, which may have its own wrapping # such as to have a loss. user_model is the one which is transformed. self._user_model = user_model or model if training: self._attribute_tracker = \ _optimizer_attributes.OptimizerAttrTracker( options) if options.defaultOutputMode(): # In training it makes sense to see only the last result, by # default. options.outputMode(enums.OutputMode.Final) if not optimizer: optimizer = optim.SGD(self._user_model.parameters(), lr=0.01) else: if options.defaultOutputMode(): # In inference it makes sense to see all the results, by default. options.outputMode(enums.OutputMode.All) if options.Training.gradient_accumulation != 1: err_msg = ( "You must set " "poptorch.Options().Training.gradientAccumulation to 1 " "or leave it as its default value (1) when running a " "poptorch.inferenceModel().") is_pipelined = (isinstance(options._execution_strategy, PipelinedExecution) and not isinstance(options._execution_strategy, ShardedExecution)) if is_pipelined: err_msg += (" Use poptorch.Options().deviceIterations " "to process a sufficient number of batches " "each run for pipelined execution instead.") raise _impl.createPoptorchError(err_msg) assert options.Training.gradient_accumulation == 1, () assert not optimizer, "Optimizer should be None for inference" self._model = model self._host_weights_version = 0 self._poptorch_version = poptorch_version self._executable = None self._outputs_structure = None self._options = options # The args parser needs to be initialised before the model gets wrapped # otherwise we will not be able to retrieve the real arguments list self._args_parser = _args_parser.ArgsParser(model) # Inputs used to compile the executable self._executable_inputs = None self._anchor_memory = {} # any anchors with unspecified output mode should receive the output # mode used for graph outputs for _, anchor in 
options.anchored_tensors.items(): if anchor[1]: anchor[2] = options.output_mode if anchor[2] == enums.OutputMode.EveryN: anchor[3] = options.output_return_period self._optimizer = optimizer self._ipu_optimizer_is_dirty = False self._host_rng_state_is_dirty = False self._cached_rng_state = None if self._options.exists("random_seed"): self._cached_rng_state = [self._options.random_seed] self._dict_optimizer = {} self.per_replica_params = {} self._training = training self._dirty_host_weights = False self._trace = None self._is_attached = False self._profiling = profiling.Channel( "poptorch.trainingModel" if self. training else "poptorch.inferenceModel") self._profiling.instrument(self, "copyWeightsToHost", "copyWeightsToDevice", "setOptimizer", "compile", "destroy") if optimizer: self.setOptimizer(optimizer) self._options._freeze() if self._training: # We don't want the pytorch model to keep the PopTorch one # alive so only keep a weak reference. parent = weakref.ref(self) class PoptorchModel(type(self._user_model)): def copyWeightsToHostIfNeeded(self): """ Return True if the weights on the host were dirty and have been updated. Return False if the weights were already up to date. """ p = parent() if p is not None: return p.copyWeightsToHostIfNeeded() return False def destroy(self): """Destroy the model: release the IPUs and the executable. """ p = parent() if p is not None: p.destroy() def __getattribute__(self, name): if name == "_host_weights_version": p = parent() if p is None: return None return p._host_weights_version if name in ("_buffers", "_parameters", "forward"): self.copyWeightsToHostIfNeeded() return object.__getattribute__(self, name) def __getattr__(self, name): attribute = super().__getattr__(name) if isinstance(attribute, torch.nn.parameter.Parameter): self.copyWeightsToHostIfNeeded() return attribute def state_dict(self, *args, destination=None, prefix="", keep_vars=False): """Return a shallow copy of the wrapped model's state dictionary. 
Note: all the elements in the state dictionary are unwrapped which means the state can be reloaded in an environment where PopTorch is not installed. """ out = collections.OrderedDict() out_cache = {} for k, v in super().state_dict(*args, destination, prefix, keep_vars).items(): v_id = id(v) # If the value occurs more than once, avoid multiple # copies. if v_id in out_cache: out[k] = out_cache[v_id] else: # If the object is wrapped then the shallow copy will # call _impl._pickleUnwrapObject and the new object will be in # the wrapped registry. # Unwrap the object if needed. v_copy = _impl.unwrapIfWrapped(copy.copy(v)) out[k] = v_copy out_cache[v_id] = v_copy return out _utils.assert_signatures_match(PoptorchModel.state_dict, torch.nn.Module.state_dict) # The mere existence of the "__torch_function__" results in a # "__getattribute__" call and hence weight copying if required. # "check_has_torch_function" and "handle_torch_function_getter" # in the PyTorch source code may explain this. # Without this, the weights will not be copied in certain # situations such as torch.equal(a, b). 
class PoptorchParameter(torch.nn.Parameter): def __getattribute__(self, name): p = parent() if p is not None: p.copyWeightsToHostIfNeeded() return object.__getattribute__(self, name) @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} return super().__torch_function__(func, types, args, kwargs) self.PoptorchParameter = PoptorchParameter class PoptorchBuffer(torch.Tensor): def __getattribute__(self, name): p = parent() if p is not None: p.copyWeightsToHostIfNeeded() return super().__getattribute__(name) @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} return super().__torch_function__(func, types, args, kwargs) self.PoptorchBuffer = PoptorchBuffer self._install_state_hooks() # __getattr__ and __getattribute__ are attributes, not methods, # unfortunately we cannot just replace them in the model object: we # have to create a wrapper class # and change the object's class. PoptorchModel.__name__ = "Poptorch%s" % type( self._user_model).__name__ self._user_model.__class__ = PoptorchModel # Register the wrapper types so that custom functions to # copy / serialize wrapped objects are set up. 
def _install_state_hooks(self):
    """Re-class the wrapped model's parameters and buffers in place.

    Every parameter of ``self._user_model`` has its ``__class__`` switched
    to ``self.PoptorchParameter`` and every buffer to
    ``self.PoptorchBuffer``. The tensor objects themselves are kept, only
    their type changes.

    :raises: the error built by ``_impl.createPoptorchError`` if a buffer
        is neither a plain ``torch.Tensor`` nor already a
        ``PoptorchBuffer`` (its class cannot be safely replaced).
    """
    for p in self._user_model.parameters():
        p.__class__ = self.PoptorchParameter

    for b in self._user_model.buffers():
        # Exact class check (not isinstance): only plain tensors and
        # already-wrapped buffers may be re-classed.
        if b.__class__ not in (torch.Tensor, self.PoptorchBuffer):
            raise _impl.createPoptorchError(
                "All buffers must be an instance of torch.Tensor "
                f"(Got {type(b)})")
        b.__class__ = self.PoptorchBuffer
def _write_optim_state_dict_if_needed(self):
    """Upload the host optimiser state to the IPU if it is flagged dirty.

    Nothing is written unless the current optimiser is a PopTorch
    ``Optimizer`` whose ``ipu_state_is_dirty`` flag is set. The flag is
    cleared afterwards, whether or not there was any state to upload.

    :raises: the error built by ``_impl.createPoptorchError`` if no
        executable has been compiled yet.
    """
    if not self.isCompiled():
        raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

    optim = self._optimizer
    needs_upload = isinstance(optim,
                              Optimizer) and optim.ipu_state_is_dirty
    if not needs_upload:
        return

    assert not optim.host_state_is_dirty, (
        "Both host "
        "and ipu states cannot be dirty at the same time.")

    # Only an optimiser which already holds state (for example one
    # restored from a checkpoint) has something to send to the device.
    if optim.has_state():
        # writeOptimizerState() writes the weights as well as the
        # optimiser state, so the host weights must be current first.
        self.copyWeightsToHostIfNeeded()
        poptorch_core.writeOptimizerState(self._executable,
                                          optim.state_dict())
    optim.ipu_state_is_dirty = False
def __getattr__(self, attr):
    """Forward attribute lookups which failed on the executor to the
    wrapped model.

    Python only calls this hook once normal lookup on the executor itself
    has failed. Parameters of the wrapped model are returned inside a
    small proxy which adds a ``replicaGrouping()`` method and forwards
    everything else to the parameter. Any other attribute is returned
    as is.

    :param attr: name of the attribute being looked up.
    :raises AttributeError: if ``_user_model`` itself is missing, or if
        the wrapped model has no such attribute.
    """
    # If '_user_model' is not set (for example on an instance created
    # without running __init__) reading it below would re-enter this
    # method forever. Fail with a regular AttributeError instead.
    if attr == "_user_model":
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'")

    model_attr = getattr(self._user_model, attr)

    if not isinstance(model_attr, torch.nn.Parameter):
        return model_attr

    # We apply this wrapper here rather than adding it to PoptorchParameter
    # for two reasons:
    # 1) We might supply the same model to multiple PopTorch wrappers
    #    (particularly we might supply it to trainingModel() and then
    #    to inferenceModel()), and we need to be able to distinguish
    #    between replicaGrouping() calls on each wrapper.
    # 2) We don't wrap inference parameters in PoptorchParameter normally,
    #    but we might want to use replicaGrouping() with inference models.
    #    If we do start doing PoptorchParameter wraps on inference models,
    #    we'd end up pointlessly copying weights back from the device.
    model = self

    class ReplicaGroupingWrapper:
        def replicaGrouping(
                self, comm_group_type: enums.CommGroupType, shards: int,
                variable_retrieval_mode: enums.VariableRetrievalMode):
            # Recorded per executor, keyed by the parameter name that was
            # requested on the executor.
            model.per_replica_params[attr] = (comm_group_type, shards,
                                              variable_retrieval_mode)

        def __getattr__(self, name):
            # Only reached for names the wrapper does not define itself
            # ('replicaGrouping' is always found by normal lookup), so
            # everything arriving here belongs to the parameter.
            return getattr(model_attr, name)

    return ReplicaGroupingWrapper()
def getAnchoredTensor(self, short_name: str) -> torch.Tensor:
    """Return the anchored tensor stored under ``short_name``.

    :param short_name: key of the tensor in the executor's anchor memory.
    :returns: the value currently held in the anchor memory for that key.
    :raises AssertionError: if no entry exists for ``short_name``.
    """
    anchors = self._anchor_memory
    is_known = short_name in anchors
    assert is_known, \
        "No tensor with name " + short_name + " found."
    return anchors[short_name]
""" if not self.isCompiled(): raise _impl.createPoptorchError(NO_EXECUTABLE_ERR) # Don't trigger a copyToHost by accessing `named_parameters` self._dirty_host_weights = False # Trigger a IPU sync -> host if needed for # the optimizer state. if self._optimizer: self._optimizer.state_dict() weights = { **dict(self._model.named_parameters()), **dict(self._model.named_buffers()) } poptorch_core.copyWeightsToDevice_impl(self._executable, tuple(weights.keys()), tuple(weights.values())) def copyNamedBuffersToDevice(self) -> None: """Copies the buffers from ``model.parameters()`` to the IPU device. """ if not self.isCompiled(): raise _impl.createPoptorchError(NO_EXECUTABLE_ERR) # pylint: disable=protected-access if 'updatableNamedBuffers' not in self._options._Popart.options: raise _impl.createPoptorchError( "No named buffers marked as updatable via " "updatableNamedBuffers option") # Don't trigger a copyToHost by accessing `named_parameters` self._dirty_host_weights = False # Trigger a IPU sync -> host if needed for # the optimizer state. if self._optimizer: self._optimizer.state_dict() buffers = {**dict(self._model.named_buffers())} # pylint: disable=protected-access updatable_buffers = \ self._options._Popart.options['updatableNamedBuffers'] updatable_buffer_pointers = tuple(buffers[b] for b in updatable_buffers) poptorch_core.copyNamedBuffersToDevice_impl(self._executable, tuple(updatable_buffers), updatable_buffer_pointers) def setOptimizer(self, optimizer: 'torch.optim.Optimizer'): """Sets the optimiser for a training model. Will overwrite the previous one. Supported optimisers: ``optim.SGD``, ``optim.Adam``, ``optim.AdamW``, ``optim.RMSProp``, ``optim.LAMB``. 
""" # Optimiser state functions require a compiled executable if self.isCompiled() and optimizer != self._optimizer: # If we're setting a new optimiser, make sure the internal state of the old # optimiser has been read back so it's not lost, and then detach the old # optimiser so that its subsequent state_dict/load_state_dict calls don't # trigger optimiser state read/writes anymore if self._optimizer and isinstance(self._optimizer, Optimizer): self._read_optim_state_dict_if_needed() self._optimizer.state_dict = \ self._optimizer.get_state_dict # We only want to update the state on the IPU if it's a brand new optimizer # (Not if the params of the existing one have changed). if isinstance(optimizer, Optimizer): optimizer.ipu_state_is_dirty = True # If it's a PopTorch optimizer: instrument the state_dict() method # to implicitly transfer the state back to the host. if isinstance(optimizer, Optimizer): optimizer.state_dict = MethodType( PoplarExecutor._get_optim_state_dict, self) self._optimizer = optimizer dict_optimizer = _optimizer_attributes.convertOptimizerToDict( optimizer, self._attribute_tracker, self._options, self.isCompiled()) if dict_optimizer != self._dict_optimizer: self._dict_optimizer = dict_optimizer self._ipu_optimizer_is_dirty = True # If we need and can update the optimizer now: do it. if self.isAttachedToDevice(): self._update_optimizer_if_needed() self._write_optim_state_dict_if_needed() def _get_module_and_name(self, n): """ Given a nested attribute path, return `(module, name)` such that `module` is the object which contains the attribute `name`, relative to `self._model`. This makes it easy to access nested attributes with `getattr` and `setattr`, using the argument splat `*a` operator, i.e.: ``` getattr(*self._get_module_and_name("some_module.layer_one.weight")) ``` gets the attribute `self._model.some_module.layer_one.weight`. 
""" m = self._model name = n sn = n.rpartition(".") if sn[1] == ".": m = m.get_submodule(sn[0]) name = sn[2] return m, name @_impl.destroyDispatcherOnExit def _compileWithDispatch(self, in_tensors, executable_filename=None): with _OverwriteContextManager(): module_namescope = None if self.options._module_namescope_enabled: # pylint: disable=protected-access module_namescope = _impl.NameScopeHook(self._model) tensor_args = flattenTensorStructure( (in_tensors.args, in_tensors.kwargs)) mlir_compiler_options = poptorch_core.CompilerOptions() mlir_compiler_options.source_location_excludes = self._options._source_location_excludes # pylint: disable=line-too-long, protected-access dispatch_failed = False try: # pylint: disable=too-many-nested-blocks # Create the graph. Future captured calls will be written into this # graph behind the scenes. poptorch_core.createGraph( poptorch_core.TracingMode( poptorch_core.TracingMode.PopART), tensor_args, mlir_compiler_options) # Move the model parameters to the ipu and take a copy to load the # originals back once this has finished cpu_params = dict(self._model.named_parameters()) cpu_buffers = dict(self._model.named_buffers()) cpu_state = self._model.state_dict(keep_vars=True) # We need to remove the PoptorchBuffer and PoptorchParam annotations # before compiling the model. In addition, we must unwrap the whole # model to prevent IPU to CPU copies when accessing the state_dict. 
_impl.unwrapModelIfNecessary(self._model) if self.per_replica_params is not None: for name, param in cpu_params.items(): if name in self.per_replica_params: if param.shape == torch.Size([]): raise _impl.createPoptorchError( "Scalars cannot be passed as per-replica " "weight tensor values") param_tensor = param.narrow(0, 0, 1).squeeze(dim=0) setattr( *self._get_module_and_name(name), torch.nn.Parameter( param_tensor, requires_grad=param.requires_grad)) d = torch.device("ipu:0") poptorch_core.startParametersMove() self._model.to(d) poptorch_core.endParametersMove() # If there were any parameters and buffers (tensors), which were # aliases on the CPU (shared the same Python ID), these will have # become separate IPU tensors during the copy to IPU # # Find all such tensors, and then # 1. Keep a map from them to the earliest cpu tensor in the # cpu_state dict. # 2. Replace IPU tensors which are not but should be aliases with # that matching the earliest. # NB the "original" name is based on order of addition of the # tensors/modules and may not be a name of the parmeter which # replaced another, e.g. the case of "weight tying", but the # name of the "replaced". However, no names will be lost but the # aliases simply harmonised to be matching tensors on CPU and IPU. state = self._model.state_dict(keep_vars=True) tensors = collections.defaultdict(list) for name, tensor in cpu_state.items(): tensors[id(tensor)].append(name) # A map of parameters and buffers (tensors) on the CPU which share # the same python id, to the earliest tensor. 
cpu_aliases = {} aliases = [v for v in tensors.values() if len(v) > 1] for a in aliases: # NB original matches that in model.named_x() as both this as # model.state_dict() loop he same OrderedDicts in same order # and the named versions return only the first instances original = a[0] for other in a[1:]: setattr(*self._get_module_and_name(other), state[original]) cpu_aliases[other] = original # Map named unique parameters and buffers on the IPU. params = dict(self._model.named_parameters()) poptorch_core.mapParamsToNames(tuple(params.keys()), tuple(params.values())) buffers = dict(self._model.named_buffers()) poptorch_core.mapParamsToNames(tuple(buffers.keys()), tuple(buffers.values())) old_addresses = _impl.getBufferAndParameterAddresses( self._model) if self.per_replica_params is not None: for name, param in cpu_params.items(): if name in self.per_replica_params: poptorch_core.setPerReplica( name, param, *self.per_replica_params[name]) poptorch_core.startDispatch() _impl.setDispatchTracing(True) _impl.setIpuContext(True) for _, hook in PoplarExecutor._precompile_hooks.items(): hook() self._options._execution_strategy.onStartTracing() # pylint: disable=protected-access # The optimizer was created using the CPU model, therefore it points # at CPU tensors. We need to remap those to IPU tensors. # We just moved '_model' to the IPU, therefore we need to join the # two maps and then remap the parameters from the optimizer. 
# From: # # cpu_tensors[name] = cpu_data_ptr # ipu_tensors[name] = ipu_tensor # # we build: # # cpu_to_ipu[cpu_data_ptr] = ipu_tensor # # And then remap all the tensors from group["params"] if self._training: cpu_tensors = { **cpu_buffers, **cpu_params, } ipu_tensors = _impl.getBufferAndParameterTensors( self._model) cpu_to_ipu = { cpu_tensors[n].data_ptr(): ipu for n, ipu in ipu_tensors.items() } for index, group in enumerate( self._optimizer.param_groups): torch.ops.poptorch.optimizer_group( index, [ cpu_to_ipu[cpu.data_ptr()] for cpu in group["params"] ]) for idx, t in enumerate(tensor_args): if t.requires_grad: raise _impl.createPoptorchError( "An input tensor to an IPU model can not have " f"requires_grad set to True, however input {idx} " f"does: {t}\nYou can set requires_grad=True from " "within the model as an alternative, and return " "gradients as outputs to your model, if required.") d = torch.device("ipu:0") # Move all the inputs to the IPU tensor_args = [t.to(d) for t in tensor_args] # Re-inject moved tensors in args and kwargs: args, kwargs = reconstructTensorStructure( (in_tensors.args, in_tensors.kwargs), tensor_args) result = self._model(*args, **kwargs) if result is not None: self._outputs_structure = result output = flattenTensorStructure(result) for x in output: if not isOnIpu(x): warnings.warn( "Output expected to be on the IPU but is on %s" % x.device.type) output = [ out.int() if out.dtype == torch.long and isOnIpu(out) else out for out in output ] output = [ out.float() if out.dtype == torch.double and isOnIpu(out) else out for out in output ] poptorch_core.startOutputsMove() output = [out.cpu() for out in output] poptorch_core.endOutputsMove() poptorch_core.finalizeGraph() except: dispatch_failed = True raise finally: self._options._execution_strategy.onEndTracing() # pylint: disable=protected-access for _, hook in PoplarExecutor._postcompile_hooks.items(): hook() _impl.setIpuContext(False) _impl.setDispatchTracing(False) # Turn off the 
dispatcher. poptorch_core.endDispatch(dispatch_failed) # Reload the cpu model state # Get the buffer and parameter addresses after the model has ran # but before resetting the model back to the cpu new_addresses = _impl.getBufferAndParameterAddresses( self._model) def _set_param(k, v): setattr(*self._get_module_and_name(k), cpu_params[v]) for k in cpu_params: cpu_params[k].__class__ = torch.nn.Parameter _set_param(k, k) # Restore aliased parameters/buffers which will not be represented # in cpu_params or cpu_buffers for k, v in cpu_aliases.items(): _set_param(k, v) for k in cpu_buffers: setattr(*self._get_module_and_name(k), cpu_buffers[k]) # Re-install the Poptorch annotations for buffers and parameters _impl.rewrapModelIfNecessary(self._model) # Check that the buffer and parameter addresses haven't been changed # in the model # Note: this is done after resetting the model back to the cpu so # that errors thrown by this don't stop the model being in a valid # state _impl.errorOnBufferOrParameterAddressChanges( old_addresses, new_addresses) if module_namescope is not None: module_namescope.remove() # We only reach this point if dispatch didn't fail if executable_filename is not None: # Compile the captured graph using PopART. executable = poptorch_core.processDispatchAndImportExecutable( self._options.toDict(), accessAttributes, self._training, self._dict_optimizer, list(self._options.anchored_tensors.values()), executable_filename) else: # Compile the captured graph using PopART. 
executable = poptorch_core.compileWithManualTracing( self._options.toDict(), accessAttributes, self._training, self._dict_optimizer, list(self._options.anchored_tensors.values())) return executable @_impl.traceMethod("modelCompilation") def _compile(self, in_tensors): """On POD we want to separate compilation from device initialisation because we want only one process to compile the model, but ``loadEngineAndConnectStreams()`` must happen at the same time in all the processes (Because they need to talk to each other during the initialisation process). This is achieved by calling the equivalent of ``compileAndExport()`` from one of the processes: this will populate the PopART cache with the executable. (We use a temp file because we don't need the result, we just want the executable to be added to the cache). The caller will then call the regular ``_compile()`` method in all the processes at the same time and they should all hit the cache. """ # Compile the poplar executable based on the batchsize. in_tensors_trace_view = self._preprocessGraph(in_tensors) # Note: in single process execution or if the cache is disabled # should_compile will always be False. with _impl.distributedCacheLock(self._model, self._options) as should_compile: # Only the first process should compile if should_compile: self._executable = self._compileWithDispatch( in_tensors_trace_view) # In distributed execution mode: # At that point only the first process will have a compiled executable: # trigger the compilation process in all the other processes. if not self.isCompiled(): self._executable = self._compileWithDispatch(in_tensors_trace_view) # Load the engine and connect the streams in all the processes. # # Note: no sync point was added because we expect the above # compileWithDispatch call to be quick as all the processes should # hit the cache. # # If the cache is disabled then we expect the compilation process # to roughly take the same amount of time in all processes. 
@_impl.traceMethod("graphPreprocessing")
def _preprocessGraph(self, in_tensors):
    """Record the executable inputs and build the view used for tracing.

    A clone of ``in_tensors`` is stored in ``self._executable_inputs``.
    A second clone is passed through ``self._narrow_tensor`` and then has
    every tensor requiring gradients detached; that second clone is
    returned.

    :param in_tensors: parsed model arguments (must provide ``clone()``
        and ``forEach()``).
    :returns: the transformed clone to trace the model with.
    """

    def detach_if_requires_grad(candidate):
        wants_grad = isinstance(
            candidate, torch.Tensor) and candidate.requires_grad
        if not wants_grad:
            return candidate
        detached = candidate.detach()
        logger.warning(
            "Input tensor has requires_grad=True set. "
            "This tensor will be detached because backward pass via "
            "inputs is not supported.")
        return detached

    # This copy is kept as given: none of the transforms below touch it.
    self._executable_inputs = in_tensors.clone()

    trace_view = in_tensors.clone()
    for transform in (self._narrow_tensor, detach_if_requires_grad):
        trace_view.forEach(transform)
    return trace_view
def save(self,
         filename: str,
         export_model: bool = True,
         save_rng_state: bool = True):
    """Save the compiled model to file.

    :param filename: Where to save the compiled executable. If it is an
        existing directory the model is saved as ``model.poptorch``
        inside it. Missing parent directories are created.
    :param export_model: If `True` the Torch model will be saved in
        the file alongside the executable. :py:func:`~poptorch.load` can
        be used to restore both the original Torch model, the PopTorch
        model and the executable.
        If `False` then only the executable will be exported and it will
        be the user's responsibility to call
        :py:func:`~poptorch.inferenceModel` or
        :py:func:`~poptorch.trainingModel` to re-create the PopTorch model
        before calling :py:meth:`~poptorch.PoplarExecutor.loadExecutable`
        to restore the executable.
    :param save_rng_state: If `True` the random number generator's state
        and seed will be saved in the file alongside the executable.
    :raises AssertionError: if the parent of ``filename`` exists but is
        not a directory.
    """
    if not self.isCompiled():
        raise _impl.createPoptorchError(NO_EXECUTABLE_ERR)

    dst_dir = os.path.dirname(filename)
    if dst_dir:
        if os.path.exists(dst_dir):
            # f-string: the message must contain the offending path, not
            # the literal text "{dst_dir}".
            assert os.path.isdir(dst_dir), (
                f"Destination folder {dst_dir} "
                "is not a directory")
        else:
            # exist_ok: tolerate another process creating the folder
            # between the exists() check and this call.
            os.makedirs(dst_dir, exist_ok=True)

    if os.path.isdir(filename):
        dirname = filename
        filename = os.path.join(dirname, "model.poptorch")
        logger.warning("save(): %s is a directory, saving model to %s",
                       dirname, filename)

    data = _poptorch_data.PoptorchData(self._poptorch_version,
                                       self._executable_inputs,
                                       self._options)
    if export_model:
        data.training = self._training
        data.model = self.model
        data.optimizer = self._optimizer
    if save_rng_state:
        data.rng_state = self.rng_state
    serialized_data = pickle.dumps(data, protocol=4)

    with self._profiling.tracepoint("saveExecutableToFile"):
        # The executable is written first, then the pickled PopTorch
        # metadata is appended to the same file.
        poptorch_core.saveExecutableToFile(self._executable, filename)
        poptorch_core.appendPoptorchMetadataToFile(serialized_data,
                                                   filename)
def cycleCount(self) -> int:
    """Return the number of cycles the IPU ran for.

    You must run the model on IPU hardware before calling this method.

    :returns: number of cycles on the IPU for the last model run. If you
        are using replicas, the returned value is the cycle count of the
        first replica only.
    :raises: the error built by ``_impl.createPoptorchError`` if cycle
        count instrumentation is disabled or if there is no compiled
        executable.
    """
    # pylint: disable=protected-access
    instrumented = self._options._Popart.options[
        'instrumentWithHardwareCycleCounter']
    if not instrumented:
        raise _impl.createPoptorchError(
            "Cycle count logging is disabled. Please set option "
            "logCycleCount to True to enable.")

    if not self.isCompiled():
        raise _impl.createPoptorchError(
            "Please run the model at least once before obtaining "
            "cycle count.")

    return poptorch_core.cycleCount(self._executable)
""" # pylint: disable=protected-access if not self.isCompiled(): err_msg = ("Please compile the model before obtaining " "compilation time.") raise _impl.createPoptorchError(err_msg) if not self._options._show_compilation_progress_bar: err_msg = ("Please set showCompilationProgressBar option " "to obtain compilation time.") raise _impl.createPoptorchError(err_msg) return self._options._progress_bar.compilation_time def __call__(self, *args: List['torch.Tensor'], **kwargs: Dict[str, 'torch.Tensor']): """ Takes the same arguments as the wrapped PyTorch `model.__call__`. .. note:: The first time the :py:class:`~poptorch.PoplarExecutor` wrapper is called, the wrapped model will be traced and compiled. """ assert self._options.connection_type != enums.ConnectionType.Never, ( "Trying to run a model on an offline device " "(ConnectionType.Never): use model.compile(inputs) instead of" " model(inputs)") # If it is compiled we take the fast path, if not we convert lists to tuples. in_tensors = self._args_parser(args, kwargs, self.isCompiled()) if not self.isCompiled(): self._compile(in_tensors) if not self._is_attached: self.attachToDevice() if not self._training: # If this is an inference model: check if the same model is not being # trained on a different IPU. # If it is: make sure the weights are updated. copyWeightsToHostIfNeeded = getattr(self._user_model, "copyWeightsToHostIfNeeded", None) if callable(copyWeightsToHostIfNeeded): copyWeightsToHostIfNeeded() if self._host_weights_version != \ self._user_model._host_weights_version: # Weights have now been updated on the Host: copy them to # the second IPU. logger.debug("Implicit copyWeightsToDevice()") self.copyWeightsToDevice() self._host_weights_version = \ self._user_model._host_weights_version self._executable_inputs.validateInputs(in_tensors) in_tensors_flat = in_tensors.asPackedFlatTuple(self._executable_inputs) # Update the optimizer state on the IPU if needed. 
def _assign_backward_error(self, input):
    """Make ``backward()`` raise a PopTorch error on every output tensor.

    ``input`` may be a tensor or an arbitrarily nested list / tuple of
    tensors. Each tensor found gets its ``backward`` attribute replaced by
    a function raising an explanatory error. Anything else is ignored.
    """

    def error_on_backward():
        raise _impl.createPoptorchError(
            "backward() cannot be called explicitly on "
            "outputs of a PopTorch model. If you're using a trainingModel, "
            "the backwards pass is performed automatically when invoking "
            "the model. If you're using an inferenceModel, you should use "
            "a trainingModel instead.")

    if isinstance(input, torch.Tensor):
        # Leaf: shadow the method on this instance only.
        input.backward = error_on_backward
        return

    if isinstance(input, (list, tuple)):
        # Container: recurse into each element.
        for element in input:
            self._assign_backward_error(element)
def _computeLatency(self, from_event: str,
                    from_reduce: Callable[[List[float]], float],
                    to_event: str,
                    to_reduce: Callable[[List[float]], float]):
    """Computes latency figures between two performance counters.

    :param from_event: Key for starting performance counter.
    :param from_reduce: Reduction function for starting counters.
    :param to_event: Key for ending performance counter.
    :param to_reduce: Reduction function for ending counters.
    :returns: A ``(minimum, maximum, average)`` tuple of the durations
        between the two events, or ``(0., 0., 0.)`` if no timestamp is
        available for either of the two events.

    .. seealso:: :py:meth:`~poptorch.PoplarExecutor.getPerfCounters` for
        the list of keys allowed.
    """
    perf_counters = self.getPerfCounters()

    def reduce_per_step(event, reduce_fn):
        # counters[i][step] is the timestamp of tensor ``i`` for iteration
        # ``step``: collapse the tensor axis to get one value per step.
        counters = perf_counters[event]
        return [
            reduce_fn([per_tensor[step] for per_tensor in counters])
            for step in range(len(counters[0]))
        ]

    start_times = reduce_per_step(from_event, from_reduce)
    end_times = reduce_per_step(to_event, to_reduce)

    # Nothing to measure. Without the check on start_times the grouping
    # factor below would be 0 and range() would raise a ValueError.
    if not start_times or not end_times:
        return (0., 0., 0.)

    # It is possible to have more input timestamps than output timestamps
    # due to other options such as gradient accumulation and output modes.
    # Whatever the case, the number of input ticks will always be divisible
    # by the number of output ticks.
    assert len(start_times) % len(end_times) == 0, \
        "Internal PopTorch error: mismatching number of start timestamps" \
        " and ending timestamps when calculating latency"

    # Find the group of input ticks corresponding to each output tick and
    # replace the whole set by its minimum
    factor = len(start_times) // len(end_times)
    durations = [
        end - min(start_times[first:first + factor])
        for first, end in zip(range(0, len(start_times), factor), end_times)
    ]

    avg = sum(durations) / len(durations)
    return (min(durations), max(durations), avg)
def _narrow_tensor(self, tensor):
    """Slice an input down to the batch size seen by the model.

    There are two concepts of batch size: the "model" batch size and the
    batching done at the popart level. The first dimension of the input is
    divided by the popart batching factor so the trace "sees" the model
    batch size; when execute is called the full batch is passed and popart
    partitions it up.

    Objects which are not tensors, and tensors without dimensions, are
    returned unchanged.
    """
    opts = self._options
    input_group_count = opts.replication_factor // opts.input_group_size

    # Input will be in form of [ModelBatchSize * BatchPerStep, ...] so we
    # should slice it up so we compile by the ModelBatchSize alone.
    batches_per_step = opts.device_iterations * input_group_count * \
        opts.Training.gradient_accumulation

    if not isinstance(tensor, torch.Tensor):
        return tensor

    # A tensor without dimensions counts as a batch of one.
    if tensor.size():
        full_batch = tensor.size()[0]
    else:
        full_batch = 1

    assert full_batch % batches_per_step == 0, (
        "Invalid batch dimension: In the input %s, the batch "
        "dimension (%d) must be a multiple of "
        "Options.deviceIterations(%d) * "
        "(Options.replicationFactor(%d) / "
        "Options.inputReplicaGrouping.input_group_size(%d)) * "
        "Options.Training.gradientAccumulation(%d) = %d "
        "because it is used to calculate the batch size which will "
        "be executed on the device in any given iteration. For a "
        "full explanation see the batching semantics page of the "
        "documentation."
    ) % (tensor.shape, full_batch, self._options.device_iterations,
         self._options.replication_factor, self._options.input_group_size,
         self._options.Training.gradient_accumulation, batches_per_step)

    if tensor.shape == torch.Size([]):
        return tensor
    return tensor.narrow(0, 0, full_batch // batches_per_step)
def registerPreCompileHook(hook: Callable
                           ) -> torch.utils.hooks.RemovableHandle:
    """Register a hook that is called before model compilation.

    Raises a ``RuntimeError`` if the hook is not callable.

    :param hook: A callable that is run before model compilation begins.
    :returns: a :py:class:`torch.utils.hooks.RemovableHandle` that can be
        used to remove the hook using
        :py:func:`~remove`
    """
    if not callable(hook):
        raise RuntimeError("Pre-compile hook must be callable")
    # The hooks are stored on the PoplarExecutor class itself, keyed by the
    # id of the handle created in _registerHook.
    hooks = PoplarExecutor._precompile_hooks  # pylint: disable=protected-access
    return _registerHook(hooks, hook)
class PoptorchData:
    """Metadata stored alongside an exported executable so that it can be
    reloaded later.

    Note: :py:func:`~poptorch.load` can only be used if all the arguments are
    provided. :py:meth:`~poptorch.PoplarExecutor.loadExecutable` can be used
    in either case (but only version and executable_inputs will be used).
    """

    def __init__(self,
                 version: str,
                 executable_inputs: List[Any],
                 options: 'poptorch.Options',
                 training: Optional[bool] = None,
                 model: Optional['torch.nn.Module'] = None,
                 optimizer: Optional['torch.optim.Optimizer'] = None,
                 random_seed: Optional[int] = None,
                 rng_state: Optional[List[int]] = None):
        self.options, self.training, self.model = options, training, model
        self.version = version
        # Assigned through the property so optimizer_state is captured too.
        self.optimizer = optimizer
        assert executable_inputs, "The executable's inputs are missing"
        self.executable_inputs = executable_inputs
        self.random_seed, self.rng_state = random_seed, rng_state

    @property
    def optimizer(self):
        """The optimizer object last assigned (may be ``None``)."""
        return self._optimizer

    @optimizer.setter
    def optimizer(self, opt):
        # Keep a snapshot of the optimizer's state dict next to the
        # optimizer itself; there is no state when no optimizer is set.
        self._optimizer = opt
        self.optimizer_state = None if opt is None else opt.state_dict()
# Override torches repr function to provide information on the pre hooks as
# well. The pre hooks is where BeginBlock is added
def module_repr(m: torch.nn.Module):
    """
    Provide a string representation of a torch.nn.Module along with the
    corresponding pre-hooks.

    This will show any BeginBlocks that have been added to the model which
    otherwise wouldn't be displayed.

    :param m: The module to represent.
    :returns: The representation: the ``repr()`` of each forward pre-hook
        (each followed by a space), then the module name and, between
        parentheses, its extra representation and its sub-modules (one per
        line, indented by two spaces per nesting level).
    """

    def _add_indent(s_, num_spaces):
        # Indent every line except the first one by num_spaces spaces.
        return ('\n' + ' ' * num_spaces).join(s_.split('\n'))

    # pylint: disable=protected-access

    # We treat the extra repr like the sub-module, one item per line
    extra_repr = m.extra_repr()
    # empty string will be split into list ['']
    extra_lines = extra_repr.split('\n') if extra_repr else []

    child_lines = []
    for key, module in m._modules.items():
        # A sub-module slot may hold None: it has no extra_repr() to call.
        mod_str = 'None' if module is None else module_repr(module)
        child_lines.append('(' + key + '): ' + _add_indent(mod_str, 2))
    lines = extra_lines + child_lines

    pre_hooks = ''.join(
        repr(hook) + ' ' for hook in m._forward_pre_hooks.values())
    main_str = pre_hooks + m._get_name() + '('
    if lines:
        # simple one-liner info, which most builtin Modules will use
        if len(extra_lines) == 1 and not child_lines:
            main_str += extra_lines[0]
        else:
            main_str += '\n  ' + '\n  '.join(lines) + '\n'

    main_str += ')'
    return main_str


_global_print = print


def print(m):
    """
    Prints a torch.nn.Module along with the corresponding pre-hooks.

    This will print any BeginBlocks that have been added to the model which
    otherwise wouldn't be displayed. Any other kind of object is forwarded
    unchanged to the builtin ``print``.
    """
    if isinstance(m, torch.nn.Module):
        # Only print the hook-aware representation: printing ``m`` as well
        # would display the module a second time.
        _global_print(module_repr(m))
    else:
        _global_print(m)
# Registry mapping a custom type to the parser object handling it.
custom_arg_parsers = dict()


def getCustomParser(custom_type_instance):
    """Return the parser registered for ``custom_type_instance``.

    The exact type of the instance is looked up first, then the registered
    types are searched for a base class of the instance.

    :returns: The parser, or ``None`` if no registered type matches.
    """
    if not custom_arg_parsers:
        return None

    # direct lookup for exact type inside custom_arg_parsers
    parser = custom_arg_parsers.get(type(custom_type_instance))
    if parser is not None:
        return parser

    # search for registered parser for base class of custom_type_instance,
    # iterate over entire dict
    for custom_type, parser in custom_arg_parsers.items():
        if isinstance(custom_type_instance, custom_type):
            return parser
    return None


# Returns the structure `tensors` as a list of its torch.Tensor contents.
def flattenTensorStructure(tensors, canonical_structure=None):
    """Flatten a nested structure into the list of tensors it contains.

    Dictionaries, lists, tuples and objects with a registered custom parser
    are traversed; anything else which is not a tensor is ignored.

    :param tensors: The structure to flatten.
    :param canonical_structure: Optional structure mirroring ``tensors``:
        when provided, dictionaries are traversed using the keys (and key
        order) of the canonical structure.
    """

    def flatten(x, c):
        parser = getCustomParser(x)
        if parser is not None:
            yield from parser.yieldTensors(x)
        elif isinstance(x, dict):
            keys = x.keys() if c is None else c.keys()
            for k in keys:
                yield from flatten(x[k], None if c is None else c[k])
        elif isinstance(x, (list, tuple)):
            cl = itertools.repeat(None, len(x)) if c is None else c
            for t, ct in zip(x, cl):
                yield from flatten(t, ct)
        elif isinstance(x, torch.Tensor):
            yield x
        # If it's not a dict/list/tuple or tensor, just ignore it

    return list(flatten(tensors, canonical_structure))


# Turns a flat `values` into the same structure as `structure`.
#
# Any non-tensor values in `structure` will be copied to the output.
#
# filter_fn: Optional function to additionally filter which tensors make it into
#            the output (eg. could supply `isOnIpu` to only get IPU tensors).
def reconstructTensorStructure(structure, values, filter_fn=lambda t: True):
    """Rebuild ``structure`` with its tensors replaced by ``values``.

    :param structure: The nested structure to copy.
    :param values: Iterable providing the replacement for each tensor, in
        traversal order.
    :param filter_fn: Only the tensors for which this returns ``True`` are
        replaced; the others are copied as they are.
    """

    # Copy the original structure but replace all the tensors by values from the
    # passed iterator.
    def copy_structure(x, it):
        parser = getCustomParser(x)
        if parser is not None:
            return parser.reconstruct(x, it)
        if isinstance(x, dict):
            return type(x)({k: copy_structure(x[k], it) for k in x.keys()})
        if isinstance(x, (tuple, list)):
            # Named tuples take their fields as positional arguments.
            if (hasattr(x, '_asdict') and hasattr(x, '_fields')):
                return type(x)(*(copy_structure(e, it) for e in x))
            return type(x)(copy_structure(e, it) for e in x)
        if isinstance(x, torch.Tensor) and filter_fn(x):
            return next(it)
        return x

    return copy_structure(structure, iter(values))


def combine_batch_tensors_gen(tensors: List[List[torch.Tensor]]
                              ) -> Generator[torch.Tensor, None, None]:
    """Concatenate the tensors of several batches along dim = 0.

    :param tensors: One flat list of tensors per batch. The number of
        tensors yielded is the length of the first list.
    """
    for tensor_id in range(len(tensors[0])):
        yield torch.cat([batch[tensor_id] for batch in tensors])


def combined_batch_generator(dataloader_iterator,
                             num_batches_to_combine,
                             drop_last=True):
    """Wraps DataLoader iterator. Generates combined batches by concatenating
    consecutive batches tensors from dataloader_iterator along dim=0.

    :param dataloader_iterator: Iterator providing the batches to combine.
    :param num_batches_to_combine: Number of consecutive batches making up
        a combined batch.
    :param drop_last: If ``True`` a trailing group of fewer than
        ``num_batches_to_combine`` batches is discarded, otherwise it is
        yielded as a smaller combined batch. Nothing is yielded when there
        is no pending batch at all (for example for an empty iterator).
    """
    tensors_to_combine = []
    batch = None

    # iterate over next data batches
    for batch in dataloader_iterator:
        if len(tensors_to_combine) >= num_batches_to_combine:
            # concatenate all tensors from concatenate list - create combined
            # batch. The new batch is only used as a structure template.
            yield reconstructTensorStructure(
                batch, combine_batch_tensors_gen(tensors_to_combine))
            tensors_to_combine = []
        # append batch tensors to concatenate list
        tensors_to_combine.append(flattenTensorStructure(batch))

    # The parentheses matter: without them "not drop_last" alone was enough
    # to yield, even when there was nothing left to combine.
    if tensors_to_combine and (
            len(tensors_to_combine) == num_batches_to_combine
            or not drop_last):
        # concatenate all tensors from concatenate list - create combined batch
        yield reconstructTensorStructure(
            batch, combine_batch_tensors_gen(tensors_to_combine))
class DataLoaderMode(enum.IntEnum):
    """Selects how the dataset is accessed.

    - ``Sync``: Access data synchronously
    - ``Async``: Uses an :py:class:`~poptorch.AsynchronousDataAccessor`
      to access the dataset
    - ``AsyncRebatched``: For iterable datasets by default PyTorch will round
      down the number of elements to a multiple of the combined batch size in
      each worker. When the number of workers is high and/or the batch size
      large this might lead to a significant part of the dataset being
      discarded. In this mode, the combined batch size used by the PyTorch
      workers will be set to 1, and the batched tensor will instead be
      constructed in the :py:class:`~poptorch.AsynchronousDataAccessor`.
      This mode is identical to Async for map-style datasets.
    """
    Sync = 0
    Async = 1
    AsyncRebatched = 2
class HalfFloatCastingBehavior(enum.IntEnum):
    """
    (deprecated) Only used for tracing. ``HalfUpcastToFloat`` is now the only
    supported option.
    """
    FloatDowncastToHalf = 0
    HalfUpcastToFloat = 1
    Default = 2
class OverlapMode(enum.Enum):
    """Selects when the host copies or preloads tensor values onto the IPU.

    - ``NoOverlap``: The host will copy the tensor to the IPU only when
      required: this minimises on-chip memory use at the cost of performance.

    - ``OverlapAccumulationLoop``: The host will preload values for the next
      gradient accumulation iteration onto an IO tile.

    - ``OverlapDeviceIterationLoop``: The host will preload values not just
      for the next gradient accumulation iteration, but the next device
      iteration, onto an IO tile. This may require more IO tiles than the
      previous setting but offers greater performance.
    """
    NoOverlap = "no_overlap"
    OverlapAccumulationLoop = "overlap_accumulation_loop"
    OverlapDeviceIterationLoop = "overlap_device_iteration_loop"
class CommGroupType(enum.IntEnum):
    """Grouping to be used when distributing an input or per-replica variable
    among replicas. See :ref:`grouping_tensor_weights`.

    In the examples below ``k`` is the group size and ``m = N/k`` the group
    count, the replicas being numbered from ``0`` to ``N-1``.

    - ``All``: This causes :py:func:`~replicaGrouping` to have no effect, as
      the same variable value is distributed to all replicas. Group count is
      ignored. This is not valid as an input group type.

    - ``Consecutive``: Each replica group is made up of consecutive replicas,
      so for group size ``k``, the groups would be set up thus:

      ``{0, 1, ... k-1}, {k, ... 2k-1} ... {N-k, ... N-1}``

    - ``Orthogonal``: Each replica group is made up by slicing the replicas
      orthogonally to the replica ordering. So for group size ``k``, with
      group count ``m = N/k``:

      ``{0, m, 2m, ...}, {1, m+1, 2m+1, ...} ... {m-1, 2m-1, ... N-1}``

    - ``NoGrouping``: Each replica gets its own value of the variable. Group
      count is ignored.
    """
    All = 0
    Consecutive = 1
    Orthogonal = 2
    NoGrouping = 3
def ipu_print_tensor(tensor: "torch.Tensor",
                     title: str = "",
                     print_gradient: bool = True,
                     summarise_threshold: int = 1000,
                     edge_items: int = 3,
                     max_line_width: int = 80,
                     digits: int = 4,
                     float_format: str = "auto",
                     separator: str = ", ",
                     open_bracket: str = "(",
                     close_bracket: str = ")") -> "torch.Tensor":
    """Adds an op to print the contents of the IPU tensor.

    When this is executed the tensor will be copied back to host and printed.

    When this operation is called in the backward pass it will print the
    gradient of the tensor.

    The operation is an identity operation and will return the exact same
    tensor. The returned tensor must be used in place of the original tensor
    in the rest of the program, to make sure that the print operation isn't
    optimised away.

    For example, if the original code looks like this:

    .. code-block:: python

        def forward(self, c, d, b):
            a = c + d
            return a + b

    If the result of ``ipu_print_tensor()`` is not used, the function will be
    optimised out by the graph optimiser and the tensor will not be printed.

    So if you want to print the value of `a`, you should do:

    .. code-block:: python

        def forward(self, c, d, b):
            a = c + d
            x = poptorch.ipu_print_tensor(a)
            return x + b

    Optionally, you can add a second string argument to be used as a title,
    as shown in the following example. The value of `a` will be printed after
    the title "summation". The value of the gradient of `a` will be printed
    after the title "summation_gradient" if the operation is called in the
    backward pass.

    .. code-block:: python

        def forward(self, c, d, b):
            a = c + d
            x = poptorch.ipu_print_tensor(a, "summation")
            return x + b

    .. warning::
        To prevent the print operation being optimised out by the graph
        optimiser, you must use the output of the print.

    :param tensor: The tensor to print.
    :param title: An optional title to print before the tensor value.
        Defaults to "".
    :param print_gradient: Whether to print the gradient tensor associated
        with this tensor. Defaults to True.
    :param summarise_threshold: If the number of elements of the tensor
        exceeds this threshold the output will be summarised. Only the edge
        elements will be displayed with an ellipsis indicating skipped
        elements. A value of 0 will disable summarisation. Defaults to 1000.
    :param edge_items: Number of edge elements to include at the beginning
        and end when summarisation is enabled. Defaults to 3.
    :param max_line_width: Lines longer than this limit will be split across
        multiple lines. A value of 0 will disable line splitting.
        Defaults to 80.
    :param digits: Number of digits to display. For integers this limit can
        be exceeded if any number is large enough. For floating points this
        does not include the exponent. The number of digits is used in
        conjunction analysis of the tensor to determine the width of each
        element to align all elements when printed. A value of 0 disables
        this analysis and each elements will be printed in an unaligned
        format. Defaults to 4.
    :param float_format: Determines the floating point format to use.
        Automatic mode determines the appropriate format based on the data.
        Defaults to "auto". One of:

        - "auto": Automatically determine the format through analysis.
        - "fixed": Use fixed point e.g. -100.00.
        - "scientific": Use scientific notation e.g. -1.123e+10.
        - "none": Do not display all elements with the same format
    :param separator: Character used to delineate values. Defaults to ", ".
    :param open_bracket: Character used to open a tensor. Defaults to "(".
    :param close_bracket: Character used to close a tensor. Defaults to ")".
    :returns: The input tensor unchanged.
    :raises: A PopTorch error if ``tensor`` is not a ``torch.Tensor`` or if
        ``float_format`` is not one of the supported values.
    """
    if not isinstance(tensor, torch.Tensor):
        raise _impl.createPoptorchError(
            "ipu print tensor must take a torch.tensor argument. "
            f"{type(tensor)} is not supported.")

    # Integer codes passed to the op for each floating point format.
    float_format_dict = {"auto": 0, "fixed": 1, "scientific": 2, "none": 3}
    if float_format not in float_format_dict:
        # Report the accepted values rather than failing with a bare KeyError.
        raise _impl.createPoptorchError(
            "ipu print tensor: float_format must be one of "
            f"{list(float_format_dict)}. {float_format!r} is not supported.")

    # The op takes print_gradient as an integer flag.
    return torch.ops.poptorch.ipu_print_tensor(tensor, title,
                                               int(print_gradient),
                                               summarise_threshold,
                                               edge_items, max_line_width,
                                               digits,
                                               float_format_dict[float_format],
                                               separator, open_bracket,
                                               close_bracket)
res = torch.ops.poptorch.end_for_loop(outputs, cloned_inputs, count) return res def cond(condition: 'torch.Tensor', then_body: Callable[[List['torch.Tensor']], List['torch.Tensor']], then_inps: List['torch.Tensor'], else_body: Callable[[List['torch.Tensor']], List['torch.Tensor']], else_inps: List['torch.Tensor']) -> List['torch.Tensor']: """An on-device if/else operation. This creates two branches of instructions executed conditionally on the device. Only for inference. The `then_body` and `else_body` should be Python functions containing the PyTorch code you wish to execute conditionally on the device. The condition is passed in the form of a boolean `Tensor` and the branch to be executed is decided in runtime directly on the device. There are a few conditions on the branch functions: * `then_body` and `else_body` can accept an arbitrary number of inputs (including zero). * Tensors defined in the `cond` caller (the outer graph) can be used inside `then_body` and `else_body` implicitly just as if they were passed through the inputs list. * `then_body` and `else_body` have to return the same number of corresponding outputs. This is because the result of the `cond` op is assigned to a common list of tensors. * all the tensors utilized by `then_body` and `else_body` are passed in by copy, so updating any of the tensors inside `then_body` and `else_body` does not affect the original tensors. To update a tensor passed in, its new value has to be returned from the body and assigned to the original tensor (please note that the number of outputs from `then_body` and `else_body` has to match). :param condition: The condition controlling the execution of `then_body` and `else_body`. :param then_body: The function to be executed if `condition` is True. :param then_inps: `then_body` input tensors. :param else_body: The function to be executed if `condition` is False. :param else_inps: `else_body` input tensors. 
def dynamic_slice(tensor: "torch.Tensor", dim: int, start: "torch.Tensor",
                  size: int, step: int) -> "torch.Tensor":
    """Torch native dynamic slices can't be properly intercepted by backends,
    so this op is provided to enable dynamic slicing in poptorch applications.

    Every argument is type-checked before the op is emitted; a PopTorch error
    (built by ``_impl.createPoptorchError``) is raised on the first argument
    of the wrong type.

    :param tensor: The tensor to slice.
    :param dim: The dimension to slice along.
    :param start: The start index.
    :param size: The slice size. Must be a constant int.
    :param step: The slice step. Must be a constant int.
    :returns: The sliced tensor.
    """
    if not isinstance(tensor, torch.Tensor):
        raise _impl.createPoptorchError(
            f"dynamic_slice must take a torch.tensor input. {type(tensor)} is "
            "not supported.")

    if not isinstance(dim, int):
        raise _impl.createPoptorchError("Dimension must be an integer.")

    if not isinstance(start, torch.Tensor):
        # Report the type of the offending argument (`start`), not `tensor`.
        raise _impl.createPoptorchError(
            "Slice start argument to dynamic_slice must be a torch.tensor. "
            f"{type(start)} is not supported.")

    if not isinstance(size, int):
        raise _impl.createPoptorchError("Size must be an integer.")

    if not isinstance(step, int):
        raise _impl.createPoptorchError("Step must be an integer.")

    return torch.ops.poptorch.dynamic_slice(tensor, dim, start, size, step)
def recomputationCheckpoint(*tensors: List["torch.Tensor"]
                            ) -> List["torch.Tensor"]:
    """Operation for checkpointing values in a computational pipeline stage.
    When recomputation is enabled, these values will not be recomputed and they
    will be stored in memory between forward and backwards passes instead.

    The tensors can be passed either as separate positional arguments or as a
    single list / tuple. In the latter case the result is a container of the
    same type as the one passed in.

    :param tensors: One or more tensors which should be check-pointed.
    :return: Tensors (same number and shape as the input tensors). A single
        positional tensor is returned as a bare tensor, several positional
        tensors are returned as a tuple.
    :raises ValueError: If any of the inputs is not a tensor.
    """
    # Allow passing a single list or tuple
    if len(tensors) == 1 and isinstance(tensors[0], (tuple, list)):
        container = tensors[0]
        checkpointed = recomputationCheckpoint(*container)
        # A one-element container comes back from the recursive call as a
        # bare tensor. Wrap it again, otherwise the container constructor
        # below would iterate over the tensor itself instead of producing a
        # one-element list / tuple.
        if isinstance(checkpointed, torch.Tensor):
            checkpointed = (checkpointed, )
        return type(container)(checkpointed)

    out = []
    for t_in in tensors:
        if not isinstance(t_in, torch.Tensor):
            raise ValueError("All inputs must be tensors")
        out.append(torch.ops.poptorch.recomputation_checkpoint(t_in))

    if len(out) == 1:
        return out[0]

    # Return a tuple by default since PopTorch does not support list inputs
    return tuple(out)
def set_available_memory(tensor: "torch.Tensor",
                         available_memory_proportion: float
                         ) -> "torch.Tensor":
    """Sets the amount of temporary memory made available to an operation.

    Operators which can be tuned with this setting include:

    * convolution
    * matrix multiplication
    * embedding lookups
    * indexing operations

    Apply this function to the output of a supported operation to control the
    trade-off between execution cycles and the temporary memory used while
    the operation executes.

    The value is a proportion of the available memory on the IPU and should
    be between 0 and 1 (inclusive). The default value is 0.6, which means
    that by default PopTorch will not use more than 60% of IPU memory for
    temporary data.

    PopTorch forwards this setting to the PopLibs operator planner, which
    tries to keep the temporary memory used below this value. In general, the
    more temporary memory an operation has available, the fewer cycles it
    needs.

    A specific operation may need more temporary memory than the amount
    given by this option. When that happens a warning message is generated.

    For more information, please refer to the `technical note
    <https://docs.graphcore.ai/projects/available-memory/>`_ on optimising
    temporary memory usage.

    >>> class BasicNetwork(nn.Module):
    ...     def __init__(self):
    ...         super().__init__()
    ...         self.conv = nn.Conv2d(4, 4, 3, stride=2)
    ...
    ...     def forward(self, x):
    ...         out = self.conv(x)
    ...         out = poptorch.set_available_memory(out, 0.2)
    ...         return out

    :param tensor: Output tensor from a supported operation (otherwise the
        statement will be an identity).
    :param available_memory_proportion: Proportion between 0.0 and 1.0
        of tile memory to be made available for temporary memory (default 0.6).
    :returns: The input tensor, as if calling an identity function.
    """
    if isinstance(tensor, torch.Tensor):
        return torch.ops.poptorch.set_available_memory(
            tensor, available_memory_proportion)

    # Anything which is not a tensor cannot carry the setting.
    raise _impl.createPoptorchError(
        "You may only set available memory for torch.tensor values. "
        f"{type(tensor)} is not supported.")
def set_overlap_for_output(output_tensors, mode: "poptorch.OverlapMode"):
    """Sets host overlap setting for output_tensors.

    Overlapping the copy from the IPUs to the host with computation can
    increase performance in some cases. However, a number of IPU tiles then
    have to be set aside as IO tiles using
    :py:func:`~poptorch.options._TensorLocationOptions.numIOTiles`, which may
    affect computation performance.

    Call this function at the end of your model's `forward` method, for each
    applicable output, just before returning the tensors.

    :param output_tensors: The output tensors to enable overlapping host IO
        for. This can be either a single tensor, or any combination of tuple,
        list, or dict of tensors.
    :param mode: Control to what extent the host IO overlaps computation.
    :returns: the output tensors, specified for overlap.

    .. seealso:: :py:class:`~poptorch.OverlapMode`.
    """

    def mark_output(tensor):
        # Applied to each leaf of the flattened structure.
        if isinstance(tensor, torch.Tensor):
            return torch.ops.poptorch.set_overlap_for_output(
                tensor, mode.value)
        raise _impl.createPoptorchError(
            "You may only set overlap for torch.tensor outputs. "
            f"{type(tensor)} is not supported.")

    leaves = flattenTensorStructure(output_tensors)
    marked = map(mark_output, leaves)
    # Rebuild the original tuple / list / dict nesting around the results.
    return reconstructTensorStructure(output_tensors, marked)
class LegacyBeginBlockFn(torch.nn.Module):
    """Wrapper which starts a block before delegating to a plain callable.

    Used to allow ``BeginBlock`` to be used with a function rather than with
    a ``torch.nn.Module``.
    """

    def __init__(self, layer_to_call, user_id=None, ipu_id=None):
        super().__init__()
        self._layer_to_call = layer_to_call
        self._user_id = user_id
        self._ipu_id = ipu_id

    def __call__(self, *input, **kwargs):
        manager = Block._stages_manager
        # Without a stages manager this wrapper only forwards the call.
        if manager is not None:
            if self._user_id is None:
                # No explicit ID was provided: ask the manager for the next
                # automatic one and keep it for subsequent calls.
                self._user_id = manager.nextAutoId()
            manager.beginStage(self._user_id, self._ipu_id)

        return self._layer_to_call(*input, **kwargs)
def removeBlocks(module):
    """Recursively remove BeginBlock annotations from a Module if it
    contains any.

    Every module returned by ``module.modules()`` gets its forward pre-hooks
    rebuilt without the ``_BlockHook`` entries; all the other hooks are kept
    in their original order.

    :param torch.nn.Module module: Module to recursively unwrap.
    """
    assert isinstance(module, torch.nn.Module)
    for submodule in module.modules():
        # pylint: disable=protected-access
        kept_hooks = OrderedDict()
        for handle_id, hook in submodule._forward_pre_hooks.items():
            if isinstance(hook, _BlockHook):
                continue
            kept_hooks[handle_id] = hook
        submodule._forward_pre_hooks = kept_hooks
seealso:: :py:meth:`~poptorch.Options.setExecutionStrategy` """ if not isinstance(layer_to_call, torch.nn.Module): # Previously, the function returned a new model so would work for any # callable. This was never documented but should still be permitted to # work. if callable(layer_to_call): return LegacyBeginBlockFn(layer_to_call, user_id, ipu_id) raise _impl.createPoptorchError( "module is not an instance of torch.nn.Module or " + "function.") # pylint: disable=protected-access if any( isinstance(hook, _BlockHook) for hook in layer_to_call._forward_pre_hooks.values()): raise _impl.createPoptorchError( "module has already been assigned to a block.") layer_to_call.register_forward_pre_hook(_BlockHook(user_id, ipu_id)) # There is no need to return as it is passed by reference, but this is for # backward compatibility return layer_to_call # pylint: enable=abstract-method def BlockFunction(user_id: Optional[str] = None, ipu_id: Optional[int] = None): """ A decorator to define blocks of the model. You can use ``BlockFunction`` as a decorator for an existing function, as follows: >>> @BlockFunction("Decoder", ipu_id=1) ... def decoder(self, encoder_output): ... self.decoder_b1(encoder_output) All layers inside the function and any functions called by the function will run on the specified IPU, if one is specified. In addition, you can combine multiple blocks into a stage. :param user_id: A user defined identifier for the block. Blocks with the same ID are considered as being a single block. Block identifiers are also used to manually specify pipelines or phases. :param ipu_id: The ID of the IPU to run on. Note that the ``ipu_id`` is an index in a multi-IPU device within PopTorch, and is separate and distinct from the device IDs used by ``gc-info``. .. 
def custom_op(inputs: Tuple["torch.Tensor"],
              name: str,
              domain: str,
              domain_version: int,
              example_outputs: Tuple["torch.Tensor"],
              attributes: Optional[Dict[str, Union[float, int, str, list,
                                                   tuple]]] = None
              ) -> List["torch.Tensor"]:
    """Applies a custom operation, implemented within PopART, to the inputs.

    :param tuple inputs: A tuple of input tensors, for example, (x, y).
    :param str name: Unique name of the PopART custom op.
    :param str domain: Domain for the op.
    :param int domain_version: Version of the domain to use.
    :param iterable example_outputs: A tuple of tensors with the same type
        and shape as the outputs. The value does not matter as all values will
        be set to zero for tracing purposes.
    :param dict attributes: A dictionary of attributes for the custom op. All
        attribute keys must be strings. All attribute values must be floats,
        ints, strings, or a list/tuple containing only floats, only ints or
        only strings (not a mix of types within the list).

    :returns: The outputs of the forward op of the custom op.
    :raises ValueError: If an attribute key is not a string, if an attribute
        value has an unsupported type, if a list/tuple attribute mixes
        types, or if a key or string value contains non-ASCII characters.
    """
    transformed_outputs = []
    for output in example_outputs:
        # Dead code which will get eliminated but will safely allow the same
        # input to be provided to example_output (since it is only supposed
        # to be a template). Otherwise the compiler may recognise the alias.
        grad = output.requires_grad
        transformed_outputs.append(
            torch.zeros_like(output, requires_grad=grad, device=output.device))

    if attributes is not None:
        # Handle attributes list
        for k, v in attributes.items():
            if not isinstance(k, str):
                raise ValueError("All attribute keys must be strings.")
            if not isinstance(v, (float, int, str, list, tuple)):
                raise ValueError("Attribute values must be floats, ints, "
                                 "strings or a list/tuple of floats, ints or "
                                 "strings.")

            if isinstance(v, (list, tuple)):
                for element in v:
                    if not isinstance(element, type(v[0])):
                        raise ValueError("The types in a list/tuple "
                                         "attribute must all be the same.")

        # Non-ascii cannot be converted to std::string in C++
        def error_on_non_ascii(s):
            if isinstance(s, (list, tuple)):
                for item in s:
                    error_on_non_ascii(item)

            if not isinstance(s, str):
                return

            if any(ord(ch) >= 128 for ch in s):
                raise ValueError(f"{s} contains non-ASCII characters.")

        for k in attributes:
            error_on_non_ascii(k)

        for v in attributes.values():
            error_on_non_ascii(v)

        # The id should not change between traces, so we need to re-use any
        # attribute dictionaries. This is more complicated because equality
        # of values is insufficient: [1, 2, 3] == [1.0, 2.0, 3.0]
        def same_attribute_types(candidate_att, search_attr):
            sorted_keys = sorted(candidate_att.keys())
            if sorted_keys != sorted(search_attr.keys()):
                return False

            for key in sorted_keys:
                candidate = candidate_att[key]
                search = search_attr[key]
                if not isinstance(candidate, type(search)):
                    return False
                # Only called on dictionaries which compare equal, so both
                # sequences have the same length. An empty sequence has no
                # element type to compare (and must not be indexed).
                if isinstance(candidate, (list, tuple)) and candidate:
                    if not isinstance(candidate[0], type(search[0])):
                        return False
            return True

        for attrib_cand in attributes_lists:
            if attrib_cand != attributes:
                continue

            # Equality does not imply same types
            if not same_attribute_types(attrib_cand, attributes):
                continue

            attributes = attrib_cand
            break
        else:
            # First time this dictionary is seen: keep it alive so that its
            # id stays valid.
            attributes_lists.append(attributes)

    # NB None is a singleton in Python
    attributes_id_str = f"{ATTR_PREFIX}{hex(id(attributes))}"

    return torch.ops.poptorch.custom_operation(inputs, name, domain,
                                               domain_version,
                                               len(transformed_outputs),
                                               transformed_outputs,
                                               attributes_id_str)
""" self._layer_to_call = layer_to_call if isinstance(self._layer_to_call, torch.nn.Module): self._layer_to_call.requires_grad_(False) self._ID = ID self.in_shapes = None self.out_shapes = None self.inputs = None self.outputs = None def execute(self): """Implementation detail.""" outs = self._layer_to_call(*self.inputs) if isinstance(outs, (list, tuple)): for persistent_output, output in zip(self.outputs, outs): persistent_output.copy_(output) else: self.outputs[0].copy_(outs) def registerPersistentData(self): """Implementation detail.""" self.inputs = [torch.zeros(i, device='cpu') for i in self.in_shapes] self.outputs = [torch.zeros(o, device='cpu') for o in self.out_shapes] poptorch_core.registerBuffersWithCallback(self._ID, self.inputs, self.outputs) def __call__(self, *input, **kwargs): """Implementation detail.""" # Mark all subsquent ops as happening on the host. torch.ops.poptorch.call_cpu_op([*input], self._ID) if _impl.isRunningOnIpu(): cpu_input = [ torch.zeros_like(i, device="cpu", requires_grad=i.requires_grad) for i in input ] else: cpu_input = input # Keep the trace happy & get output shapes by actually calling the # layer. cpu_outputs = self._layer_to_call(*cpu_input) # Did we originally just output a single tensor? originally_single_tensor = False # Slight fixup for single tensor outputs. if not isinstance(cpu_outputs, (list, tuple)): originally_single_tensor = True cpu_outputs = [cpu_outputs] # Record metadata for our inputs & outputs, to later allocate in # permanent buffers. self.in_shapes = [i.shape for i in input] self.out_shapes = [o.shape for o in cpu_outputs] if _impl.isRunningOnIpu(): outputs = [ torch.zeros_like(o, device="ipu", requires_grad=o.requires_grad) for o in cpu_outputs ] else: outputs = cpu_outputs # End CPU host execution and show the JIT what the output looks like. outputs = torch.ops.poptorch.end_cpu_op(outputs) # Register this callback with poptorch so it knows what to call. 
def fps(src: "torch.Tensor",
        ptr: List[int],
        ratio: float = 0.5,
        random_start: bool = False) -> "torch.Tensor":
    """PopTorch implementation of the `torch_cluster` `fps` operator.

    This op is a sampling algorithm from the `"PointNet++: Deep Hierarchical
    Feature Learning on Point Sets in a Metric Space"
    <https://arxiv.org/abs/1706.02413>`_ paper, and iteratively samples the
    most distant point with regard to the rest points.

    :param src: Point feature matrix.
    :param ptr: Pointer vector which defines ranges of nodes assigned to a
        specific sample. Must be a list with at least 2 elements.
    :param ratio: The sampling ratio. Must be a ``float``.
    :param random_start: If set to `False`, use the first node in `src` as
        the starting node.
    :returns: A tensor of `src` point indexes.
    """
    if not isinstance(src, torch.Tensor):
        raise _impl.createPoptorchError(
            f"`fps` must take a torch.tensor input. {type(src)} is "
            "not supported.")

    if not isinstance(ptr, list):
        raise _impl.createPoptorchError("`ptr` must be a list of integers.")

    # `ptr` delimits ranges, so at least one (start, end) pair is required.
    if len(ptr) < 2:
        raise _impl.createPoptorchError(
            "`ptr` must contain at least 2 elements.")

    if not isinstance(ratio, float):
        raise _impl.createPoptorchError(
            f"`ratio` must be of float type. {type(ratio)} is not supported.")

    if not isinstance(random_start, bool):
        raise _impl.createPoptorchError(
            f"`random_start` must be of bool type. {type(random_start)} is "
            "not supported.")

    return torch.ops.poptorch.fps(src, ptr, ratio, random_start)
{type(y)} is " "not supported.") batch_x = list() if batch_x is None else batch_x batch_y = list() if batch_y is None else batch_y batch_x_is_list = isinstance(batch_x, list) batch_y_is_list = isinstance(batch_y, list) batch_x_is_tensor = isinstance(batch_x, torch.Tensor) batch_y_is_tensor = isinstance(batch_y, torch.Tensor) if batch_x_is_list and batch_y_is_list: return torch.ops.poptorch.nearest_batch_list(x, y, batch_x, batch_y) if batch_x_is_tensor and batch_y_is_tensor: pass elif batch_x_is_list and batch_y_is_tensor: batch_x = torch.tensor(batch_x, dtype=batch_y.dtype) elif batch_x_is_tensor and batch_y_is_list: batch_y = torch.tensor(batch_y, dtype=batch_x.dtype) else: raise _impl.createPoptorchError( f"`batch_x` and `batch_y` must be torch.Tensors or lists while " f"`batch_x` is of type {type(batch_x)} and `batch_y` is of type " f"{type(batch_y)}.") return torch.ops.poptorch.nearest(x, y, batch_x, batch_y) class MultiConv(): """ Combines all convolution layers evaluated inside this scope into a single multi-convolution. Multi-convolutions allow for a set of data-independent convolutions to be executed in parallel. Executing convolutions in parallel can lead to an increase in the data throughput. For example: >>> with poptorch.MultiConv(): ... y = self.convA(x) ... v = self.convB(u) Combines the two data-independent convolutions into a single multi-convolution. Refer to the PopLibs documentation for further information on multi-convolutions. 
""" def __init__(self): self._available_memory_proportions = None self._partials_types = None self._plan_type = None self._per_conv_reserved_tiles = None self._cycle_back_off = None self._enable_conv_ditherings = None @staticmethod def _validatePerConvProperty(name, value, expected_scalar_type): if value is None: return value if isinstance(value, expected_scalar_type): # Wrap as tuple return (value, ) if isinstance(value, (list, tuple)) and len(value) > 0 and all( isinstance(x, expected_scalar_type) for x in value): return value raise AssertionError(f"Invalid {name}!") def availableMemoryProportions(self, value: Union[float, List[float]] ) -> "poptorch.MultiConv": """The available memory proportion per convolution, each [0, 1). For more information, please refer to the `technical note `_ on optimising temporary memory usage. :param value: Can be a ``float`` value in which case the same value is used for all of the convolutions. Otherwise, can be a ``tuple`` or ``list`` containing as many ``float`` values as the number of convolutions. :returns: ``self``, to support method chaining. """ name = "available memory proportion" value = self._validatePerConvProperty(name, value, float) self._available_memory_proportions = value return self def partialsTypes(self, value: Union[torch.dtype, List[torch.dtype]] ) -> "poptorch.MultiConv": """The partials type used for each convolution. :param value: Can be a single instance of ``torch.dtype`` in which case the same value is used for all of the convolutions. Otherwise, can be a ``tuple`` or ``list`` containing as many ``torch.dtype`` values as the number of convolutions. :returns: ``self``, to support method chaining. """ def encode_dtype(dtype): if dtype in [torch.float, torch.float32]: return 0 if dtype in [torch.half, torch.float16]: return 1 raise ValueError( 'Invalid partials types. 
Expecting torch.float or torch.half') if isinstance(value, (list, tuple)): value = [encode_dtype(v) for v in value] else: value = (encode_dtype(value), ) self._partials_types = value return self def enableConvDithering(self, value: Union[bool, List[bool]] ) -> "poptorch.MultiConv": """Enable per-convolution dithering. :param value: Can be a ``bool`` value in which case the same value is used for all of the convolutions. Otherwise, can be a ``tuple`` or ``list`` containing as many ``bool`` values as the number of convolutions. :returns: ``self``, to support method chaining. """ if value is None: self._enable_conv_ditherings = value elif isinstance(value, (list, tuple)): for x in value: if not isinstance(x, bool): raise ValueError("value must be bool or list of bools") self._enable_conv_ditherings = value elif isinstance(value, bool): self._enable_conv_ditherings = (value, ) else: raise ValueError("value must be bool or list of bools") return self def planType(self, value: "poptorch.MultiConvPlanType") -> "poptorch.MultiConv": """Select the multi-convolution execution strategy. :param value: An instance of :py:class:`~poptorch.MultiConvPlanType`. :returns: ``self``, to support method chaining. """ if value is None: self._plan_type = value elif isinstance(value, enums.MultiConvPlanType): self._plan_type = value else: raise AssertionError("Invalid plan type!") return self def perConvReservedTiles(self, value: int) -> "poptorch.MultiConv": """Tiles to reserve for each convolution. :param value: Number of tiles. :returns: ``self``, to support method chaining. """ assert isinstance(value, int) self._per_conv_reserved_tiles = value return self def cycleBackOff(self, value: float) -> "poptorch.MultiConv": """Cycle back off proportion. :param value: Number between 0 and 1. :returns: ``self``, to support method chaining. 
class VariableAttributes:
    """Track which attributes are variable or constant.

    Is accessible via any PopTorch optimizer via the ``variable_attrs``
    attribute.

    >>> opt = poptorch.optim.SGD(params, lr=0.01)
    >>> opt.variable_attrs.isConstant("lr")
    """

    def __init__(self, variable_attributes: List[str],
                 allowed_attributes: List[str]) -> None:
        """
        :param variable_attributes: list of variable attributes.
        :param allowed_attributes: list of all the attributes.
        """
        # Work on a private copy: markAsVariable() mutates this list and must
        # not modify the list object owned by the caller.
        self._variable_attributes = list(variable_attributes)
        self._allowed_attributes = allowed_attributes

    def _checkAllowed(self, attr: str) -> None:
        """Raise ``AssertionError`` if ``attr`` is not an allowed attribute."""
        # Raised explicitly so the check is not stripped under `python -O`.
        if attr not in self._allowed_attributes:
            raise AssertionError(
                f"Unknown attribute {attr},"
                f" allowed values: {self._allowed_attributes}")

    def isConstant(self, attr: str) -> bool:
        """Return True if the attribute is marked as constant"""
        return attr not in self._variable_attributes

    def markAsConstant(self, attr: str) -> None:
        """Explicitly mark an attribute as constant

        :raises AssertionError: if ``attr`` is not an allowed attribute.
        """
        self._checkAllowed(attr)
        self._variable_attributes = [
            a for a in self._variable_attributes if a != attr
        ]

    def markAsVariable(self, attr: str) -> None:
        """Explicitly mark an attribute as variable

        :raises AssertionError: if ``attr`` is not an allowed attribute.
        """
        self._checkAllowed(attr)
        # Marking the same attribute repeatedly must not grow the list.
        if attr not in self._variable_attributes:
            self._variable_attributes.append(attr)
class SGD(Optimizer, torch.optim.SGD):
    # pylint: disable=line-too-long
    """ Stochastic gradient descent with optional momentum.

    The optimizer is based on PyTorch's implementation
    (`torch.optim.SGD <https://pytorch.org/docs/stable/generated/torch.optim.SGD.html>`_)
    with optional loss and velocity scaling.

    PopTorch provides two possible variants. Both variants are mathematically
    identical to PyTorch but differ in their stability and efficiency.

    .. note:: If you set momentum to zero and do not use gradient accumulation,
        PopTorch will use a simple SGD variant and ignore the values of
        ``use_combined_accum``, ``accum_type`` and ``velocity_accum_type``.

    **Separate tensor variant (default)**

    If you set ``use_combined_accum`` to ``False`` (default), you will use a
    more stable but more memory intensive variant. In this case, PopTorch keeps
    two state tensors for each weight: one for gradient accumulation and one for
    velocity. It operates as follows when training:

    #. PopTorch runs one or more forward/backwards steps, equal to the number
       of gradient accumulations (see
       :py:func:`~poptorch.options._TrainingOptions.gradientAccumulation`).
       Each time PopTorch sums the gradients, storing them in accumulators.
    #. Once all the forward and backwards have completed, PopTorch uses the
       summed gradients to update the velocities. At this stage, PopTorch will
       correct the scale based on the setting of
       :py:func:`~poptorch.options._TrainingOptions.accumulationAndReplicationReductionType`.
       PopTorch stores the velocities as optimiser states.
    #. Finally, PopTorch uses the velocities to update the parameters, taking
       into account the loss scaling and learning rate.

    With ``use_combined_accum`` set to False, you can independently change the
    data type used for storing the accumulated gradients and the velocity values
    using ``accum_type`` and ``velocity_accum_type``, respectively.

    Velocity scaling is ignored for this variant.

    .. note:: If the number of gradient accumulations is high, you can use off
        chip memory for the velocity tensors with a minimal performance hit.

        >>> opts.TensorLocations.setOptimizerLocation(
        ...     poptorch.TensorLocationSettings().useOnChipStorage(False))

    **Combined tensor variant**

    If you set ``use_combined_accum`` to ``True``, you will use a less stable
    but more memory efficient variant. In this case PopTorch uses a single
    tensor (the combined tensor) for gradient accumulation and velocity. It
    operates as follows when training:

    #. PopTorch runs one or more forward/backwards steps equal to the number of
       gradient accumulations (see
       :py:func:`~poptorch.options._TrainingOptions.gradientAccumulation`).
       For each step, PopTorch immediately calculates an increment or decrement
       for the combined tensors for each parameter. The amount of increment or
       decrement takes into account the setting of
       :py:func:`~poptorch.options._TrainingOptions.accumulationAndReplicationReductionType`
       as well as removing loss scaling and introducing any velocity scaling.
    #. After running all the steps, the combined tensor will be equal to the
       new velocities. PopTorch uses these to update the parameters taking into
       account the velocity scaling and learning rate.

    PopTorch ignores the ``accum_type`` and ``velocity_accum_type`` values when
    using a combined tensor. In addition, there are no optimizer state tensors
    and so ``opts.TensorLocations.setOptimizerLocation`` has no effect.

    .. warning:: For both variants, reducing the velocity scaling during
        training will result in temporary over-estimation of the velocity and
        could cause model instability. Increasing the scaling may temporarily
        slow model convergence but not lead to instability.
    """
    # Variables which don't exist in the parent optimizer class and are
    # global (Cannot be set per group).
    _child_vars = ["loss_scaling"]
    # All the attributes and variables which don't exist in the parent optimizer class.
    _child_only = _child_vars + [
        "velocity_scaling", "use_combined_accum", "accum_type",
        "velocity_accum_type", "max_grad_norm"
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = [
        "lr", "momentum", "dampening", "weight_decay", "nesterov",
        "velocity_scaling"
    ]

    def __init__(self,
                 params: Iterable,
                 lr: float,
                 momentum: Optional[float] = None,
                 dampening: Optional[float] = None,
                 weight_decay: Optional[float] = None,
                 nesterov: Optional[bool] = None,
                 maximize: Optional[bool] = None,
                 foreach: Optional[bool] = None,
                 differentiable: Optional[bool] = None,
                 loss_scaling: Optional[float] = None,
                 velocity_scaling: Optional[float] = None,
                 use_combined_accum: Optional[bool] = None,
                 accum_type: Optional[torch.dtype] = None,
                 velocity_accum_type: Optional[torch.dtype] = None,
                 max_grad_norm: Optional[float] = None) -> None:
        """
        Arguments left as ``None`` are not forwarded to ``torch.optim.SGD``
        (so its own defaults apply); every argument that is not ``None`` is
        recorded as a variable attribute in ``variable_attrs``.

        :param iterable params: parameters to optimize.
        :param lr: learning rate.
        :param momentum: momentum factor.
        :param dampening: dampening term for momentum.
        :param weight_decay: Weight decay (L2 penalty) factor.
        :param nesterov: Whether to enable Nesterov momentum. Default is
            `False`.
        :param maximize: Forwarded to ``torch.optim.SGD`` when not ``None``.
        :param foreach: Forwarded to ``torch.optim.SGD`` when not ``None``.
        :param differentiable: Forwarded to ``torch.optim.SGD`` when not
            ``None``.
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Default is ``1.0``.
        :param velocity_scaling: Factor by which to scale the velocity values
            to assist numerical stability when using float16. (This applies to
            the combined variant only.)
        :param use_combined_accum: Whether to use a combined accumulator.
            Default is ``False``.
        :param accum_type: data type used for gradients. Must be
            ``torch.float16`` or ``torch.float32`` (default).
        :param velocity_accum_type: data type used to store
            the velocity values for each parameter. Must be
            ``torch.float16`` or ``torch.float32`` (default).
        :param max_grad_norm: Maximum norm of gradients. Default is `inf`.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of __init__
        # (no other local may exist yet: every entry of locals() except
        # "self" is treated as a constructor argument by _parseArgs).
        parent_args, variables = _parseArgs(locals(), SGD._child_only)
        Optimizer.__init__(self)
        torch.optim.SGD.__init__(self, **parent_args)

        # Loss scaling is a global setting: store it as an attribute
        if loss_scaling is None:
            loss_scaling = 1.0

        if use_combined_accum is None:
            use_combined_accum = False
        self.use_combined_accum = use_combined_accum

        if accum_type is None:
            accum_type = torch.float32
        if velocity_accum_type is None:
            velocity_accum_type = torch.float32

        self.loss_scaling = loss_scaling

        # Velocity scaling can be set per group: register it in defaults
        # and update the existing groups.
        if velocity_scaling is None:
            velocity_scaling = 1.0
            # NB this will be overridden to loss_scaling in the case of the
            # separate tensor variant.
        else:
            # An explicit velocity_scaling only makes sense for the combined
            # variant: warn (but do not fail) otherwise.
            if not use_combined_accum:
                logger.warning("velocity_scaling value ignored when "
                               "using the separate variant "
                               "(use_combined_accum=False). In future, this "
                               "will lead to an error. Please update your "
                               "code.")

        # Only the combined variant exposes velocity_scaling as a per-group
        # option; groups which already define it keep their own value.
        if use_combined_accum:
            self.defaults["velocity_scaling"] = velocity_scaling
            for group in self.param_groups:
                group.setdefault("velocity_scaling", velocity_scaling)

        # NOTE(review): `nesterov` is not read again after this assignment
        # within __init__.
        if nesterov is None:
            nesterov = False

        supportedTypes = [torch.float16, torch.float32]
        errString = ("Accumulation types must be either torch.float32"
                     " or torch.float16")
        assert accum_type in supportedTypes, errString
        self.accum_type = accum_type
        assert velocity_accum_type in supportedTypes, errString
        self.velocity_accum_type = velocity_accum_type

        if max_grad_norm is None:
            max_grad_norm = float("Inf")
        self.max_grad_norm = max_grad_norm

        # Allowed attribute names are the keys of the optimizer defaults plus
        # the global child-only variables.
        self.variable_attrs = VariableAttributes(
            variables,
            list(self.defaults) + SGD._child_vars)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the picklable state: the parent optimizer's state extended
        with the PopTorch-specific attributes and IPU state flags."""
        state = torch.optim.SGD.__getstate__(self)
        # Manually save the attributes
        # (groups / defaults are saved by the parent)
        state["variable_attrs"] = self.variable_attrs
        state["loss_scaling"] = self.loss_scaling
        state["use_combined_accum"] = self.use_combined_accum
        state["accum_type"] = self.accum_type
        state["velocity_accum_type"] = self.velocity_accum_type
        state["max_grad_norm"] = self.max_grad_norm
        # Mark the state as dirty only if there is one.
        state["_state_dict"] = self._state_dict
        state["ipu_state_is_dirty"] = self.has_state()
        state["host_state_is_dirty"] = False
        return state
_group_vars = ["lr", "betas", "eps", "weight_decay", "amsgrad"] def __init__( self, params: Iterable, lr: Optional[float] = None, betas: Optional[Tuple[float, float]] = None, eps: Optional[float] = None, weight_decay: Optional[float] = None, amsgrad: Optional[bool] = None, foreach: Optional[bool] = None, maximize: Optional[bool] = None, capturable: Optional[bool] = None, differentiable: Optional[bool] = None, fused: Optional[bool] = None, loss_scaling: Optional[float] = None, accum_type: Optional[torch.dtype] = None, first_order_momentum_accum_type: Optional[torch.dtype] = None, second_order_momentum_accum_type: Optional[torch.dtype] = None, max_grad_norm: Optional[float] = None) -> None: """ :param iterable params: parameters to optimize. :param lr: learning rate :param betas: ``(beta1, beta2)`` parameters used in Adam. :param eps: term added to the denominator to ensure numerical stability. :param weight_decay: Weight decay factor. :param amsgrad: Not supported (must be False). :param loss_scaling: Factor by which to scale the loss and hence gradients to assist numerical stability when using float16. :param accum_type: data type used for gradients. :param first_order_momentum_accum_type: data type used to store the first order momentum values for each parameter. :param second_order_momentum_accum_type: data type used to store the second order momentum values for each parameter. :param max_grad_norm: Maximum norm of gradients. Default is `inf`. 
""" # pylint: disable=unused-argument # Call to locals() must be at the very top of __init__ parent_args, variables = _parseArgs(locals(), Adam._child_only) Optimizer.__init__(self) torch.optim.Adam.__init__(self, **parent_args) if loss_scaling is None: loss_scaling = 1.0 if accum_type is None: accum_type = torch.float32 if first_order_momentum_accum_type is None: first_order_momentum_accum_type = torch.float32 if second_order_momentum_accum_type is None: second_order_momentum_accum_type = torch.float32 if max_grad_norm is None: max_grad_norm = float("Inf") # All the child attributes are global: store them as # attributes. self.loss_scaling = loss_scaling supportedTypes = [torch.float16, torch.float32] errString = ("Accumulation types must be either torch.float32" " or torch.float16") assert accum_type in supportedTypes, errString self.accum_type = accum_type assert first_order_momentum_accum_type in supportedTypes, errString self.first_order_momentum_accum_type = \ first_order_momentum_accum_type assert second_order_momentum_accum_type in supportedTypes, errString self.second_order_momentum_accum_type = \ second_order_momentum_accum_type self.max_grad_norm = max_grad_norm self.variable_attrs = VariableAttributes( variables, list(self.defaults) + Adam._child_vars) def __getstate__(self) -> Dict[str, Any]: state = torch.optim.Adam.__getstate__(self) # Manually save the attributes # (groups / defaults are saved by the parent) state["variable_attrs"] = self.variable_attrs state["loss_scaling"] = self.loss_scaling state["accum_type"] = self.accum_type state["first_order_momentum_accum_type"] = \ self.first_order_momentum_accum_type state["second_order_momentum_accum_type"] = \ self.second_order_momentum_accum_type state["max_grad_norm"] = self.max_grad_norm # Mark the state as dirty only if there is one. 
class AdamW(Optimizer, torch.optim.AdamW):
    """ Adam optimizer with true weight decay.

    This optimizer matches PyTorch's implementation
    (`torch.optim.AdamW <https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html>`_)
    with optional loss scaling.

    AMSGrad is currently not supported."""

    # Variables which don't exist in the parent optimizer class and are
    # global (Cannot be set per group).
    _child_vars = ["loss_scaling"]
    # All the attributes and variables which don't exist in the parent optimizer class.
    _child_only = [
        *_child_vars,
        "bias_correction",
        "accum_type",
        "first_order_momentum_accum_type",
        "second_order_momentum_accum_type",
        "max_grad_norm",
    ]
    # Attributes (from the parent or child class) which can be set per group.
    _group_vars = ["lr", "betas", "weight_decay", "eps", "amsgrad"]

    def __init__(
            self,
            params: Iterable,
            lr: Optional[float] = None,
            betas: Optional[Tuple[float, float]] = None,
            eps: Optional[float] = None,
            weight_decay: Optional[float] = None,
            amsgrad: Optional[bool] = None,
            maximize: Optional[bool] = None,
            foreach: Optional[bool] = None,
            capturable: Optional[bool] = None,
            differentiable: Optional[bool] = None,
            fused: Optional[bool] = None,
            loss_scaling: Optional[float] = None,
            bias_correction: Optional[bool] = None,
            accum_type: Optional[torch.dtype] = None,
            first_order_momentum_accum_type: Optional[torch.dtype] = None,
            second_order_momentum_accum_type: Optional[torch.dtype] = None,
            max_grad_norm: Optional[float] = None) -> None:
        """
        Only the arguments which are not ``None`` and which are not specific
        to this class are forwarded to ``torch.optim.AdamW``.

        :param iterable params: parameters to optimize.
        :param lr: learning rate
        :param betas: ``(beta1, beta2)`` parameters used in AdamW.
        :param eps: term added to the denominator to ensure numerical
           stability.
        :param weight_decay: Weight decay factor.
        :param amsgrad: Not supported (must be False).
        :param loss_scaling: Factor by which to scale the loss and hence
            gradients to assist numerical stability when using float16.
            Defaults to ``1.0``.
        :param bias_correction: True: compute Adam with bias correction.
            Defaults to ``True``.
        :param accum_type: data type used for gradients.
        :param first_order_momentum_accum_type: data type used to store
            the first order momentum values for each parameter.
        :param second_order_momentum_accum_type: data type used to store
            the second order momentum values for each parameter.
        :param max_grad_norm: Maximum norm of gradients. Default is `inf`.

        The three data types default to ``torch.float32`` and must be either
        ``torch.float16`` or ``torch.float32``.
        """
        # pylint: disable=unused-argument
        # Call to locals() must be at the very top of __init__
        parent_args, variables = _parseArgs(locals(), AdamW._child_only)
        Optimizer.__init__(self)
        torch.optim.AdamW.__init__(self, **parent_args)

        # Global (not per-group) settings live directly on the optimizer.
        self.loss_scaling = 1.0 if loss_scaling is None else loss_scaling
        self.bias_correction = (True
                                if bias_correction is None else bias_correction)

        allowed_dtypes = [torch.float16, torch.float32]
        dtype_error = ("Accumulation types must be either torch.float32"
                       " or torch.float16")
        requested_dtypes = (
            ("accum_type", accum_type),
            ("first_order_momentum_accum_type",
             first_order_momentum_accum_type),
            ("second_order_momentum_accum_type",
             second_order_momentum_accum_type),
        )
        # Validate and store each data type in declaration order.
        for attr_name, dtype in requested_dtypes:
            if dtype is None:
                dtype = torch.float32
            assert dtype in allowed_dtypes, dtype_error
            setattr(self, attr_name, dtype)

        self.max_grad_norm = (float("Inf")
                              if max_grad_norm is None else max_grad_norm)

        allowed_attrs = list(self.defaults) + AdamW._child_vars
        self.variable_attrs = VariableAttributes(variables, allowed_attrs)

    def __getstate__(self) -> Dict[str, Any]:
        """Return the parent optimizer's picklable state extended with the
        attributes specific to this class and the IPU state flags."""
        state = torch.optim.AdamW.__getstate__(self)
        # Manually save the attributes
        # (groups / defaults are saved by the parent)
        saved_attrs = (
            "variable_attrs",
            "loss_scaling",
            "bias_correction",
            "accum_type",
            "first_order_momentum_accum_type",
            "second_order_momentum_accum_type",
            "max_grad_norm",
        )
        for attr_name in saved_attrs:
            state[attr_name] = getattr(self, attr_name)
        # Mark the state as dirty only if there is one.
        state["_state_dict"] = self._state_dict
        state["ipu_state_is_dirty"] = self.has_state()
        state["host_state_is_dirty"] = False
        return state
_group_vars = [ "lr", "momentum", "weight_decay", "alpha", "eps", "centered" ] def __init__( self, params: Iterable, lr: Optional[float] = None, alpha: Optional[float] = None, eps: Optional[float] = None, weight_decay: Optional[float] = None, momentum: Optional[float] = None, centered: Optional[bool] = None, foreach: Optional[bool] = None, maximize: Optional[bool] = None, differentiable: Optional[bool] = None, loss_scaling: Optional[float] = None, accum_type: Optional[torch.dtype] = None, first_order_momentum_accum_type: Optional[torch.dtype] = None, second_order_momentum_accum_type: Optional[torch.dtype] = None, use_tf_variant: Optional[bool] = None) -> None: """ :param iterable params: parameters to optimize. :param lr: learning rate. :param alpha: smoothing constant. :param eps: term added to the denominator to ensure numerical stability. :param weight_decay: L2 penalty coefficient. :param momentum: momentum factor. :param centered: True: compute centred RMSprop in which the gradient is normalized by an estimate of its variance. :param loss_scaling: Factor by which to scale the loss and hence gradients to assist numerical stability when using float16. :param accum_type: data type used for gradients. :param first_order_momentum_accum_type: data type used to store the first order momentum values for each parameter. :param second_order_momentum_accum_type: data type used to store the second order momentum values for each parameter. :param use_tf_variant: False: If True, use the TensorFlow variant of RMSProp. 
""" # pylint: disable=unused-argument # Call to locals() must be at the very top of __init__ parent_args, variables = _parseArgs(locals(), RMSprop._child_only) Optimizer.__init__(self) torch.optim.RMSprop.__init__(self, **parent_args) if loss_scaling is None: loss_scaling = 1.0 if accum_type is None: accum_type = torch.float32 if first_order_momentum_accum_type is None: first_order_momentum_accum_type = torch.float32 if second_order_momentum_accum_type is None: second_order_momentum_accum_type = torch.float32 if use_tf_variant is None: use_tf_variant = False self.loss_scaling = loss_scaling supportedTypes = [torch.float16, torch.float32] errString = ("Accumulation types must be either torch.float32" " or torch.float16") assert accum_type in supportedTypes, errString self.accum_type = accum_type assert first_order_momentum_accum_type in supportedTypes, errString self.first_order_momentum_accum_type = \ first_order_momentum_accum_type assert second_order_momentum_accum_type in supportedTypes, errString self.second_order_momentum_accum_type = \ second_order_momentum_accum_type self.use_tf_variant = use_tf_variant self.variable_attrs = VariableAttributes( variables, list(self.defaults) + RMSprop._child_vars) def __getstate__(self) -> Dict[str, Any]: state = torch.optim.RMSprop.__getstate__(self) # Manually save the attributes # (groups / defaults are saved by the parent) state["variable_attrs"] = self.variable_attrs state["loss_scaling"] = self.loss_scaling state["accum_type"] = self.accum_type state["first_order_momentum_accum_type"] = \ self.first_order_momentum_accum_type state["second_order_momentum_accum_type"] = \ self.second_order_momentum_accum_type state["use_tf_variant"] = self.use_tf_variant # Mark the state as dirty only if there is one. 
state["_state_dict"] = self._state_dict state["ipu_state_is_dirty"] = self.has_state() state["host_state_is_dirty"] = False return state class LAMB(Optimizer, torch.optim.Optimizer): """ Layer-wise Adaptive Moments (LAMB) optimizer (biased version). Based on "Large Batch Optimization for Deep Learning: Training BERT in 76 minutes" (https://arxiv.org/abs/1904.00962). The scaling function phi(z) is fixed as min(z, max_weight_norm); """ # Variables which don't exist in the parent optimizer class and are # global (Cannot be set per group). _child_vars = ["loss_scaling"] # All the attributes and variables which don't exist in the parent optimizer class. _child_only = _child_vars + [ "bias_correction", "accum_type", "first_order_momentum_accum_type", "second_order_momentum_accum_type" ] # Attributes (from the parent or child class) which can be set per group. _group_vars = ["lr", "weight_decay", "betas", "eps", "max_weight_norm"] def __init__(self, params: Iterable, lr: Optional[float] = None, betas: Tuple[float, float] = None, eps: Optional[float] = None, weight_decay: Optional[float] = None, bias_correction: Optional[bool] = None, loss_scaling: Optional[float] = None, max_weight_norm: Optional[float] = None, accum_type: Optional[torch.dtype] = None, first_order_momentum_accum_type: Optional[torch.dtype] = None, second_order_momentum_accum_type: Optional[torch.dtype] = None ) -> None: """ :param iterable params: parameters to optimize. :param lr: learning rate :param betas: ``(beta1, beta2)`` parameters used in LAMB. :param eps: term added to the denominator to ensure numerical stability/ :param weight_decay: weight decay factor. :param bias_correction: True: compute LAMB with bias correction. :param loss_scaling: Factor by which to scale the loss and hence gradients to assist numerical stability when using float16. :param max_weight_norm: maximum value of the output of scaling function, phi(). Set to None to disable scaling function. 
def step(self, closure: Optional[Callable] = None) -> Optional[float]:
    """Apply one LAMB update to every parameter which has a gradient.

    For each parameter the first and second order moments are updated
    in place, the (optionally bias corrected) update is computed, and the
    parameter is moved by ``lr * trust`` where ``trust`` is the ratio of
    the clamped weight norm to the update norm.

    :param closure: optional callable re-evaluating the model; when given
        it is called once and its return value is returned.
    :return: the value returned by ``closure`` or ``None``.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        beta1, beta2 = group["betas"]
        # "max_weight_norm" can be set per group (it is part of the
        # defaults), so the group value must take precedence. Fall back
        # to the optimizer-wide value if the group doesn't carry one.
        max_weight_norm = group.get("max_weight_norm")
        if max_weight_norm is None:
            max_weight_norm = self.max_weight_norm

        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad.data

            state = self.state[p]
            if len(state) == 0:
                # First update for this parameter: create the moments.
                state["step"] = 0
                state["exp_avg"] = torch.zeros_like(p.data)
                state["exp_avg_sq"] = torch.zeros_like(p.data)
            state["step"] += 1

            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
            if self.bias_correction:
                bias_correction1 = 1 - beta1**state["step"]
                bias_correction2 = 1 - beta2**state["step"]
            else:
                bias_correction1 = 1
                bias_correction2 = 1

            # Moments are updated in place so they persist in the state.
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

            denom = (exp_avg_sq.sqrt() /
                     math.sqrt(bias_correction2)).add_(group["eps"])
            upd = ((exp_avg / bias_correction1) /
                   denom) + group["weight_decay"] * p.data

            r1 = p.data.pow(2).sum().sqrt()
            r2 = upd.pow(2).sum().sqrt()
            r1_ = r1.clamp(max=max_weight_norm)

            # A zero norm would make the ratio meaningless (or divide by
            # zero): use a neutral trust ratio in that case.
            if r1_ == 0 or r2 == 0:
                trust = 1.0
            else:
                trust = r1_ / r2

            p.data.add_(upd, alpha=-group['lr'] * trust)

    return loss
def _check_constructor_match_parent(child_class: Type[torch.optim.Optimizer]
                                    ) -> None:
    """Check that a child optimizer's constructor mirrors its parent's.

    The parent is the second base class of ``child_class``. The child's
    ``__init__`` must take the parent's parameters first, with the same
    names and in the same order, followed by exactly the names listed in
    ``child_class._child_only``.

    :param child_class: class to check; it must have at least two base
        classes and a ``_child_only`` attribute.
    :raises AssertionError: if the number, names or order of the
        parameters differ from what is expected.
    """
    parent = child_class.__bases__[1]
    parent_names = list(inspect.signature(parent.__init__).parameters)
    child_names = list(inspect.signature(child_class.__init__).parameters)
    extra_args = list(child_class._child_only)  # pylint: disable=protected-access

    expected_count = len(parent_names) + len(extra_args)
    assert expected_count == len(child_names), (
        f"Expected {expected_count} parameters but got "
        f"{len(child_names)}")

    # Only the names are compared, so only the names are reported.
    for idx, (expected, actual) in enumerate(zip(parent_names, child_names)):
        assert actual == expected, (
            f"Mismatch for parameter {idx}: expected "
            f"'{expected}' but got '{actual}'")

    # The child-only arguments come right after the parent's.
    for expected, actual in zip(extra_args, child_names[len(parent_names):]):
        assert actual == expected, (f"Expected an extra argument named "
                                    f"'{expected}' but got '{actual}'")
class Attribute():
    """Context manager setting PopTorch attributes for the enclosed code.

    Each keyword argument names an attribute group and maps to a dictionary
    of key / value pairs. On entry every pair is set using
    ``torch.ops.poptorch.set_attribute``; on exit the values which were
    active before entering are set again and the keys which did not exist
    are cleared using ``torch.ops.poptorch.clear_attribute``.
    """
    # Attributes currently set, shared by all the instances:
    # {attribute group: {key: value}}
    _current_attrs = {}

    def __init__(self, **kwargs):
        self._kwargs = kwargs
        # Snapshot of _current_attrs taken when entering the context.
        self._saved = {}

    def __enter__(self):
        self._saved = copy.deepcopy(Attribute._current_attrs)
        for attr, dictionary in self._kwargs.items():
            for k, v in dictionary.items():
                torch.ops.poptorch.set_attribute(attr, k, v)
            # Always merge into a dictionary owned by the class: storing
            # the caller's dictionary directly would let a nested Attribute
            # modify it (and this instance's arguments) when it updates
            # the same attribute group.
            Attribute._current_attrs.setdefault(attr, {}).update(dictionary)

    def __exit__(self, type, value, traceback):  # pylint: disable=redefined-builtin
        for attr, dictionary in self._kwargs.items():
            saved_dict = self._saved.get(attr, {})
            for k in dictionary:
                if k not in saved_dict:
                    torch.ops.poptorch.clear_attribute(attr, k)
                else:
                    torch.ops.poptorch.set_attribute(attr, k, saved_dict[k])
        Attribute._current_attrs = self._saved
class _PrecisionOptions(_options_impl.OptionsDict):
    """ Options related to processing the PyTorch JIT graph prior to
    lowering to PopART

    Can be accessed via :py:attr:`poptorch.Options.Precision`:

    >>> opts = poptorch.Options()
    >>> opts.Precision.enableFloatingPointExceptions(True)
    """

    def __init__(self,
                 popart_options: "poptorch.options._PopartOptions") -> None:
        # Several setters below write directly into the PopART options.
        self._popart_options = popart_options
        super().__init__()

    def halfFloatCasting(
            self,
            half_float_casting: "poptorch.HalfFloatCastingBehavior"  # pylint: disable=unused-argument
    ) -> "poptorch.options._PrecisionOptions":
        """
        DO NOT USE: about to be removed.
        """
        logger.warning("[Deprecated] Do not call "
                       "options.Precision.halfFloatCasting(): "
                       "HalfUpcastToFloat is now the only supported option "
                       "and matches PyTorch's behaviour so you don't need "
                       "to explicitly set it.")
        return self

    def runningStatisticsAlwaysFloat(self, value: bool
                                     ) -> "poptorch.options._PrecisionOptions":
        """
        DO NOT USE: about to be removed.

        :raises ValueError: if ``value`` is not a bool.
        """
        if not isinstance(value, bool):
            raise ValueError(
                "runningStatisticsAlwaysFloat needs to be set to a bool")
        logger.warning("[Deprecated] Do not call "
                       "options.Precision.runningStatisticsAlwaysFloat(): "
                       "False is now the only supported option "
                       "and matches PyTorch's behaviour so you don't need "
                       "to explicitly set it.")
        return self

    def enableFloatingPointExceptions(
            self, enabled: bool) -> "poptorch.options._PrecisionOptions":
        """Set whether floating point exceptions are enabled on the IPU.

        When enabled, an exception will be generated when the IPU encounters
        any one of the following:

        * Operation resulting in subtraction of infinities
        * Divisions by zero or by infinity
        * Multiplications between zero and infinity
        * Real operations producing complex results
        * Comparison where any one operand is Not-a-Number

        :param enabled:
            * True: raise ``RuntimeError`` on floating point exception
            * False: do not raise ``RuntimeError`` (default)
        """
        assert isinstance(enabled, bool), \
            "enableFloatingPointExceptions needs to be set to a bool"

        self._popart_options.set("enableFloatingPointChecks", enabled)
        return self

    def enableStochasticRounding(self, enabled: bool
                                 ) -> "poptorch.options._PrecisionOptions":
        """Set whether stochastic rounding is enabled on the IPU.

        Stochastic rounding rounds up or down a value to half (float16)
        randomly such that the expected (mean) result of rounded value is
        equal to the unrounded value. It can improve training performance by
        simulating higher precision behaviour and increasing the speed or
        likelihood of model convergence. However, the model is
        non-deterministic and represents a departure from (deterministic)
        standard IEEE FP16 behaviour.

        In the general case, we recommend enabling stochastic rounding for
        training where convergence is desirable, but not for inference where
        non-determinism may be undesirable.

        :param enabled:
            * True: Enable stochastic rounding on the IPU.
            * False: Disable stochastic rounding.
        """
        self._popart_options.set("enableStochasticRounding", enabled)
        return self

    def setPartialsType(self, dtype: torch.dtype
                        ) -> "poptorch.options._PrecisionOptions":
        """Set the data type of partial results for matrix multiplication and
        convolution operators.

        The matrix multiplication and convolution operators store intermediate
        results known as partials as part of the calculation. You can use this
        option to change the data type of the partials. Using ``torch.half``
        reduces on-chip memory use at the cost of precision.

        :param torch.dtype dtype:
            The type to store partials, which must be either
            ``torch.float`` or ``torch.half``
        :raises ValueError: if ``dtype`` is neither of the supported types.
        """
        if dtype in (torch.float, torch.float32):
            type_str = 'float'
        elif dtype in (torch.half, torch.float16):
            type_str = 'half'
        else:
            # Note the trailing space: the two literals are concatenated.
            raise ValueError("parameter to setPartialsType should be either "
                             "torch.float or torch.half")

        # The same type is used for both matmuls and convolutions.
        self._popart_options.set("partialsTypeMatMuls", type_str)
        self._popart_options.set("convolutionOptions",
                                 {"partialsType": type_str})
        return self
A large value for ``gradient_accumulation`` can improve training throughput by amortising optimiser update costs, most notably when using :py:class:`~poptorch.PipelinedExecution` or when training is distributed over a number of replicas. However, the consequential increase in the number of samples between optimiser updates can have an adverse impact on training. The reason why the efficiency gains are most notable when training with models with multiple IPUs which express pipelined model parallelism (via :py:class:`~poptorch.PipelinedExecution` or by default and annotating the model :py:class:`~poptorch.BeginBlock` or :py:class:`~poptorch.Block`) is because the pipeline has "ramp up" and "ramp down" steps around each optimiser update. Increasing the gradient accumulation factor in this instance reduces the proportion of time spent in the "ramp up" and "ramp down" phases, increasing overall throughput. When training involves multiple replicas, including the cases of sharded and phased execution, each optimiser step incurs a communication cost associated with the reduction of the gradients. By accumulating gradients, you can reduce the total number of updates required and thus reduce the total amount of communication. .. note:: Increasing the global batch size can have adverse effects on the sample efficiency of training so it is recommended to use a low or unity gradient accumulation count initially, and then try increasing to achieve higher throughput. You may also need to scale other hyper-parameters such as the optimiser learning rate accordingly. 
def _check_reduction_arg(self, reduction_type, name):
    """Validate a reduction type passed to one of the setters.

    :param reduction_type: value to validate.
    :param name: name of the option, used as prefix of the error message.
    :raises ValueError: if ``reduction_type`` is not an
        ``enums.ReductionType`` or is ``enums.ReductionType.NoReduction``.
    """
    is_reduction = isinstance(reduction_type, enums.ReductionType)
    if is_reduction and not (reduction_type
                             == enums.ReductionType.NoReduction):
        return
    raise ValueError(name + " must be set to "
                     "poptorch.ReductionType.Mean or "
                     "poptorch.ReductionType.Sum")
The default strategy is `Running` when the `accum_type` of the optimizer is set to half-precision (float16) format. Otherwise the `Post` strategy is used as this strategy is typically more performant but the `Post` strategy is less numerically robust. :param mean_reduction_strategy: * Running: Keeps the reduction buffer as the current mean. This is preferred for numerical stability as the buffer value is never larger than the magnitude of the largest micro batch gradient. * Post: Divides by the accumulationFactor and replicatedGraphCount after all of the gradients have been reduced. In some cases this can be faster then using Running, however is prone to overflow. * PostAndLoss (deprecated): Divides by the replicatedGraphCount before the backwards pass, performs the gradient reduction across micro batches, and then divides by the accumulationFactor. This is to support legacy behaviour and is deprecated. """ self.set(meanAccumulationAndReplicationReductionStrategy= mean_reduction_strategy) return self def setAutomaticLossScaling(self, enabled: bool ) -> "poptorch.options._TrainingOptions": """Set whether automatic loss scaling is enabled on the IPU. When using float16/half values for activations, gradients, and weights, the loss value needs to be scaled by a constant factor to avoid underflow/overflow. This adjustment is known as loss scaling. This setting automatically sets a global loss scaling factor during training. Note: Automatic loss scaling is a preview feature. It is well tested and enabled in some of our example applications, but may not behave as expected in all models. Recommendation: if your model with automatic loss scaling enabled does not converge or triggers a compilation error, then you will need to set the loss scale manually. :param enabled: * True: Enable automatic loss scaling on the IPU. * False: Disable automatic loss scaling. 
class _PopartOptions:
    """Options specific to the PopART backend.

    Only for advanced users.

    Most options from `popart.SessionOptions` can be set using this class.

    .. note:: there is no mapping for the various PopART enums so integers
        need to be used instead.

    Can be accessed via :py:attr:`poptorch.Options._Popart`:

    >>> opts = poptorch.Options()
    >>> opts._Popart.set("autoRecomputation", 3) # RecomputationType::Pipeline
    >>> opts._Popart.set("syntheticDataMode",
    >>>                  int(popart.SyntheticDataMode.RandomNormal))
    """

    def __init__(self) -> None:
        self._is_frozen = False
        self.options = {}
        # Options which are always explicitly set to False at creation.
        for disabled in ("instrumentWithHardwareCycleCounter",
                         "rearrangeAnchorsOnHost"):
            self.set(disabled, False)

    def __deepcopy__(self, memory):
        """Return a deep copy of these options which is never frozen."""
        clone = _PopartOptions()
        memory[id(self)] = clone
        for name, attribute in self.__dict__.items():
            # The copy must be modifiable even if the original is frozen.
            source = False if name == '_is_frozen' else attribute
            setattr(clone, name, copy.deepcopy(source, memory))
        return clone

    def checkIsFrozen(self, option=None):
        """Raise ``AttributeError`` if the options are frozen.

        Nothing happens if ``option`` is ``'_is_frozen'`` or if the
        ``_is_frozen`` attribute doesn't exist yet.
        """
        # Skip check during object initialization.
        if not hasattr(self, '_is_frozen'):
            return
        if option == '_is_frozen':
            return
        if self._is_frozen:
            raise AttributeError("Can't modify frozen Options")

    def set(self, key: str, value: Union[int, float, str, List[str], Set[str]]
            ) -> "poptorch.options._PopartOptions":
        """Store ``value`` under ``key`` unless the options are frozen."""
        self.checkIsFrozen()
        self.options[key] = value
        return self

    def setEngineOptions(self, engine_options: Dict[str, str]
                         ) -> "poptorch.options._PopartOptions":
        """Store ``engine_options`` under the ``engineOptions`` key."""
        self.set('engineOptions', engine_options)
        return self

    def setPatterns(self, patterns: Dict[str, bool],
                    level: int = 2) -> "poptorch.options._PopartOptions":
        """Override the default patterns of PopART's compiler.

        :param patterns: Dictionary of pattern names to
            enable / disable.
        :param level: Integer value corresponding to the
            ``popart::PatternsLevel`` to use to initialise the ``Patterns``.
        """
        assert isinstance(level, int)
        assert isinstance(patterns, dict)
        self.set("patterns_level", level)
        self.set("patterns", patterns)
        return self

    def __repr__(self):
        entries = [
            f"{name}={value.__repr__()}"
            for name, value in self.options.items()
        ]
        return f"{type(self).__name__}({', '.join(entries)})"
class TensorLocationSettings(_options_impl.OptionsDict):
    """Define where a tensor is stored

    >>> opts = poptorch.Options()
    >>> opts.TensorLocations.setActivationLocation(
    ...     poptorch.TensorLocationSettings().useOnChipStorage(False))
    """

    def minElementsForOffChip(self, min_elements: int
                              ) -> "poptorch.TensorLocationSettings":
        """Set the number of elements below which offloading won't be
        considered.

        :param min_elements: minimum number of elements (must be an int).
        """
        assert isinstance(min_elements, int)
        self.createOrSet(minElementsForOffChip=min_elements)
        return self

    def minElementsForReplicatedTensorSharding(
            self, min_elements: int) -> "poptorch.TensorLocationSettings":
        """Only enable replicated tensor sharding (RTS) for tensors with more
        than `min_elements` elements.

        :param min_elements: minimum number of elements (must be an int).
        """
        assert isinstance(min_elements, int)
        self.createOrSet(minElementsForReplicatedTensorSharding=min_elements)
        return self

    def useOnChipStorage(self, use: bool = True
                         ) -> "poptorch.TensorLocationSettings":
        """Permanent tensor storage

        :param bool use:
            True: use on chip memory.
            False: use off chip memory.
            None: keep it undefined.
        """
        if use is None:
            # Undefined: make sure no value is stored for this setting.
            self.deleteIfExists("onChip")
            return self

        assert isinstance(use, bool)
        # The flag is stored as an integer.
        on_chip = int(use)
        self.createOrSet(onChip=on_chip)
        return self

    def useReplicatedTensorSharding(self, use: bool = True
                                    ) -> "poptorch.TensorLocationSettings":
        """Enable replicated tensor sharding

        (relevant for weights and optimiser states)

        :param use: must be a bool; stored as an integer.
        """
        assert isinstance(use, bool)
        sharded = int(use)
        self.createOrSet(useReplicatedTensorSharding=sharded)
        return self

    def useIOTilesToLoad(self, use: bool = True
                         ) -> "poptorch.TensorLocationSettings":
        """Load tensor through IO tiles

        :param use: Use IO tiles if True,
                    use Compute tiles if False.
        """
        assert isinstance(use, bool)
        io_tiles = int(use)
        self.createOrSet(useIOTilesToLoad=io_tiles)
        return self

    def useIOTilesToStore(self, use: bool = True
                          ) -> "poptorch.TensorLocationSettings":
        """Use IO tiles to store tensors.

        (relevant for replicated tensor sharded tensors)

        :param use: Use IO tiles if True,
                    use Compute tiles if False.
        """
        assert isinstance(use, bool)
        io_tiles = int(use)
        self.createOrSet(useIOTilesToStore=io_tiles)
        return self
def numIOTiles(self, num_tiles: int
               ) -> "poptorch.options._TensorLocationOptions":
    """ Assigns the number of tiles on the IPU to be IO rather than compute.

    Allocating IO (input/output) tiles reduces the number of IPU tiles
    available for computation but allows you to reduce the latency of
    copying tensors from host to the IPUs using the function
    :py:func:`~poptorch.set_overlap_for_input`, IPUs to host using the
    function :py:func:`~poptorch.set_overlap_for_output` or to use off-chip
    memory with reduced latency by setting the option
    :py:meth:`~poptorch.TensorLocationSettings.useIOTilesToLoad`.
    As reducing the number of computation tiles may reduce performance, you
    should not use any IO tiles until you have successfully run your model
    and used profiling to identify "streamCopy" entries which take up a
    significant proportion of execution time.

    :param num_tiles: number of IO tiles: must be an even integer between
        32 and 192 (both included).
    :return: this object, so that calls can be chained.
    """
    assert isinstance(num_tiles, int)
    err_msg = "numIOTiles must be an even number between 32 and 192."
    assert 32 <= num_tiles <= 192 and num_tiles % 2 == 0, err_msg

    self.createOrSet(numIOTiles=num_tiles)
    return self
BlockId = str


class Stage:
    """
    The various execution strategies are made of `Stages`: a stage consists
    of one of more `Blocks` running on one IPU.

    .. seealso:: :py:class:`~poptorch.PipelinedExecution`,
        :py:class:`~poptorch.ShardedExecution`,
        :py:class:`~poptorch.ParallelPhasedExecution`,
        :py:class:`~poptorch.SerialPhasedExecution`.
    """

    def __init__(self, *block_ids: BlockId) -> None:
        """
        :param block_ids: IDs (strings) of the blocks this stage is made of.
        """
        non_strings = [b for b in block_ids if not isinstance(b, str)]
        assert not non_strings, (
            f"Block IDs are supposed to be strings but got {block_ids}")
        # Stage / phase IDs stay at -1 and the IPU at None until they are
        # explicitly assigned.
        self._ipu = None
        self._phase_id = -1
        self._stage_id = -1
        self._blocks = block_ids

    @property
    def blocks(self) -> List[BlockId]:
        """List of blocks this stage is made of."""
        return self._blocks

    def ipu(self, ipu: int) -> "poptorch.Stage":
        """Set the IPU on which this stage will run"""
        assert isinstance(ipu, int)
        self._ipu = ipu
        return self

    def _setStage(self, stage: int) -> "poptorch.Stage":
        """Set the stage ID; ``None`` leaves the current ID untouched."""
        if stage is None:
            return self
        self._stage_id = stage
        return self
class Phase:
    """Represents an execution phase"""

    def __init__(self, *arg: Union[BlockId, "poptorch.Stage"]):
        """ Create a phase.

        :param arg: must either be one or more
            :py:class:`Stages`, or one or more
            blocks ``user_id``.

        If one or more strings are passed they will be interpreted as
        :py:class:`~poptorch.Block` IDs representing a single
        :py:class:`~poptorch.Stage`.

        Within a ``Phase``, the stages will be executed in parallel.

        >>> with poptorch.Block("A"):
        ...     layer()
        >>> with poptorch.Block("B"):
        ...     layer()
        >>> p = Phase(poptorch.Stage("A").ipu(0))
        >>> # 2 stages made of one block each
        >>> p = Phase(poptorch.Stage("A").ipu(0), poptorch.Stage("B").ipu(1))
        >>> p = Phase("A","B") # One Stage made of 2 blocks
        """
        only_stages = all(isinstance(elt, Stage) for elt in arg)
        if not only_stages:
            # Block IDs: they all belong to one single new Stage.
            assert all(isinstance(elt, str) for elt in arg), \
                "All arguments must either be block IDs (strings) or " \
                "Stages: " + str([type(elt) for elt in arg])
            self.stages = [Stage(*arg)]
            return
        self.stages = arg

    def stage(self, idx):
        """Return the stage stored at index ``idx``."""
        return self.stages[idx]

    def ipus(self, *ipus):
        """Assign one IPU for each stage contained in this Phase.

        The number of IPUs passed must match the number of stages in the
        Phase.
        """
        assert len(ipus) == len(self.stages), (
            f"Phase contains "
            f"{len(self.stages)} stages but you provided {len(ipus)} ipus")
        for current, ipu_id in zip(self.stages, ipus):
            current.ipu(ipu_id)
These partitions can be: a :py:class:`~poptorch.Stage`, a :py:class:`~poptorch.Block` or a :py:class:`~poptorch.BeginBlock`. If none of these are passed, an :py:class:`~poptorch.AutoStage` strategy can be passed instead to decide how the stage IDs are created. By default, `poptorch.AutoStage.SameAsIpu` is used: The stage ID will be set to the selected IPU number. This implies that each unique :py:class:`~poptorch.Block` or :py:class:`~poptorch.BeginBlock` in the graph must have their `ipu_id` explicitly set when using `AutoStage`. Example 1: Blocks `user_id` are known, IPUs are inferred. >>> with poptorch.Block("A"): ... layer1() >>> with poptorch.Block("B"): ... layer2() >>> with poptorch.Block("C"): ... layer3() >>> with poptorch.Block("D"): ... layer4() >>> opts = poptorch.Options() >>> # Create a 4 stages pipeline based on `user_id`, 4 IPUs will be used. >>> opts.setExecutionStrategy(poptorch.PipelinedExecution("A","B", ... "C","D")) Stages can also be set explicitly: >>> # Create a 2 stages pipeline with the blocks `user_id`, 2 IPUs will be used. >>> opts.setExecutionStrategy(poptorch.PipelinedExecution( ... poptorch.Stage("A","B"), ... poptorch.Stage("C","D"))) Example 2: Blocks `ipu_id` are known, use default AutoStage. >>> poptorch.Block.useAutoId() >>> with poptorch.Block(ipu_id=0): ... layer1() >>> with poptorch.Block(ipu_id=1): ... layer2() >>> with poptorch.Block(ipu_id=2): ... layer3() >>> with poptorch.Block(ipu_id=3): ... layer4() >>> # Automatically create a 4-stage pipeline matching the block `ipu_id`. >>> opts.setExecutionStrategy(poptorch.PipelinedExecution()) >>> # Note: poptorch.PipelinedExecution() >>> # is the default execution strategy when blocks are defined. Example 3: Non-consecutive stages placed on the same IPU. >>> with poptorch.Block(ipu_id=0): ... layer1() >>> with poptorch.Block(ipu_id=1): ... layer2() >>> with poptorch.Block(ipu_id=0): ... 
layer3() >>> # Automatically create a 3-stage pipeline forcing the stage >>> # IDs to be incremental. >>> opts.setExecutionStrategy(poptorch.PipelinedExecution( ... poptorch.AutoStage.AutoIncrement)) :param args: Either a :py:class:`~poptorch.AutoStage` strategy or an explicit list of stages or block IDs. :type args: poptorch.AutoStage, [str], [poptorch.Stage] """ block_map = {} auto_stage = enums.AutoStage.SameAsIpu if len(args) == 1 and isinstance(args[0], enums.AutoStage): auto_stage = args[0] else: for stage_id, arg in enumerate(args): # arg must either be a Stage, a block_id or a list of block_ids if isinstance(arg, Stage): stage = arg elif isinstance(arg, str): stage = Stage(arg) else: assert all(isinstance(elt, str) for elt in arg) stage = Stage(*arg) stage._setStage(stage_id) # pylint: disable=protected-access for block in stage.blocks: assert block not in block_map, ( f"{block} associated " f"with more than one stage") logger.debug( "block %s added to stage %d%s", block, stage_id, " on IPU %d" % stage._ipu if stage._ipu is not None else '') block_map[block] = stage if block_map: class PipelineStageManager(_options_impl.IStageManager): def __init__(self, block_map): super().__init__() self._block_map = block_map def getStage(self, block_id): assert block_id in self._block_map, ( f"Unknown Block " f"'{block_id}' list of expected Blocks: " f"{list(self._block_map.keys())}") return self._block_map[block_id] stages_manager = PipelineStageManager(block_map) else: stages_manager = _DefaultStageManager(auto_stage) super().__init__(stages_manager, block_map) def backendOptions(self): return {"execution_mode": 0} class ShardedExecution(PipelinedExecution): """Will shard the execution of the passed Stages or if no stage is passed will consider each unique Block `ipu_id` encountered during tracing as a different stage. >>> with poptorch.Block(ipu_id=0): ... layer() >>> with poptorch.Block(ipu_id=1): ... layer() >>> with poptorch.Block(ipu_id=2): ... 
class _IPhasedExecution(_IExecutionStrategy):
    """Common interface for Phased execution strategies"""

    def __init__(self,
                 *phases: Union["poptorch.Phase", List["poptorch.Stage"],
                                List[BlockId]]):
        """Execute the model's blocks in phases

        Phases can be passed either as separate positional arguments
        (``Strategy(phase0, phase1)``) or as one single list or tuple of
        phases (``Strategy([phase0, phase1])``).

        :param phases: Definition of phases must be either:

            - a list of :py:class:`~poptorch.Phase`
            - a list of list of :py:class:`~poptorch.Stage`
            - a list of list of :py:class:`~poptorch.Block` IDs (Each list of
              blocks will be considered as a single
              :py:class:`~poptorch.Stage`)
        :type phases: [:py:class:`~poptorch.Phase`],
            [[:py:class:`~poptorch.Stage`]], [[str]]
        """
        self._tensors_liveness = enums.Liveness.AlwaysLive
        self._separate_backward_phase = False
        self._phases = []
        block_map = {}

        # A single non-empty list/tuple whose elements are themselves phase
        # definitions (Phase objects or nested lists) is the documented
        # "list of phases" form: unpack it. A flat list of Stages or block
        # IDs is left alone because it describes one single phase.
        if (len(phases) == 1 and isinstance(phases[0], (list, tuple))
                and phases[0] and all(
                    isinstance(elt, (Phase, list, tuple))
                    for elt in phases[0])):
            phases = tuple(phases[0])

        # TODO(T30127): Define what the default strategy should be.
        assert phases, (
            "There is currently no AutoStage for "
            "PhasedExecution, please explicitly specify the phases")

        for phase_id, args in enumerate(phases):
            if isinstance(args, Phase):
                phase = args
            else:
                # A lone Stage / block ID is a phase made of that one item.
                if not isinstance(args, (list, tuple)):
                    args = [args]
                phase = Phase(*args)
            self._phases.append(phase)
            for stage in phase.stages:
                stage._phase_id = phase_id  # pylint: disable=protected-access
                for block in stage.blocks:
                    assert block not in block_map, (f"{block} associated "
                                                    "with more than one stage")
                    logger.debug(
                        "block %s added to phase %d%s", block, phase_id,
                        " on IPU %d" %
                        stage._ipu if stage._ipu is not None else '')
                    block_map[block] = stage

        class PhaseManager(_options_impl.IStageManager):
            """Resolve a block ID to the stage it was assigned to."""

            def __init__(self, block_map):
                super().__init__()
                self._block_map = block_map

            def getStage(self, block_id):
                assert block_id in self._block_map, (
                    f"Unknown Block "
                    f"'{block_id}' list of expected Blocks: "
                    f"{list(self._block_map.keys())}")
                return self._block_map[block_id]

        stages_manager = PhaseManager(block_map)
        super().__init__(stages_manager, block_map)

    def phase(self, phase: int) -> "poptorch.Phase":
        """Return the requested :py:class:`~poptorch.Phase`

        :param phase: Index of the phase (must be a non-negative integer).
        """
        assert isinstance(
            phase,
            int) and phase >= 0, "Phases are identified by positive integers"
        return self._phases[phase]

    def useSeparateBackwardPhase(self, use: bool = True):
        """Given a forward pass with 3 phases (0,1,2), by default the phases
        will run as follows: ::

            fwd:       bwd:
            phase 0 -> phase 4
            phase 1 -> phase 3
            phase 2 -> phase 2

        .. note:: The end of the forward pass and the beginning of the
            backward pass are part of the same phase.

        If ``useSeparateBackwardPhase(True)`` is used then no phase
        will be shared between the forward and backward passes: ::

            fwd:       bwd:
            phase 0 -> phase 6
            phase 1 -> phase 5
            phase 2 -> phase 4

        :param use: Whether the backward pass gets its own phases.
        :return: ``self`` so that calls can be chained.
        """
        assert isinstance(use, bool)
        self._separate_backward_phase = use
        return self

    def backendOptions(self) -> Dict[str, Union[int, bool]]:
        """Return the backend options describing this phased strategy."""
        return {
            "execution_mode": 2,
            "separate_backward_phase": self._separate_backward_phase,
            "tensors_liveness": self._tensors_liveness.value
        }
class SerialPhasedExecution(_IPhasedExecution):
    """All the phases run serially on a single group of IPUs.

    For example:

    - phase 0 runs on ipu 0 & 1
    - phase 1 runs on ipu 0 & 1
    - phase 2 runs on ipu 0 & 1

    Each positional argument of the constructor defines one phase.

    >>> with poptorch.Block("A"):
    ...     layer()
    >>> with poptorch.Block("A2"):
    ...     layer()
    >>> with poptorch.Block("B"):
    ...     layer()
    >>> with poptorch.Block("B2"):
    ...     layer()
    >>> with poptorch.Block("C"):
    ...     layer()
    >>> with poptorch.Block("C2"):
    ...     layer()
    >>> opts = poptorch.Options()
    >>> strategy = poptorch.SerialPhasedExecution(
    ...     poptorch.Phase(poptorch.Stage("A"), poptorch.Stage("A2")),
    ...     poptorch.Phase(poptorch.Stage("B"), poptorch.Stage("B2")),
    ...     poptorch.Phase(poptorch.Stage("C"), poptorch.Stage("C2")))
    >>> strategy.phase(0).ipus(0,1)
    >>> strategy.phase(1).ipus(0,1)
    >>> strategy.phase(2).ipus(0,1)
    >>> opts.setExecutionStrategy(strategy)
    """

    def setTensorsLiveness(self, liveness: "poptorch.Liveness"
                           ) -> "poptorch.SerialPhasedExecution":
        """Set the tensors liveness mode and return ``self`` for chaining.

        See :py:class:`~poptorch.Liveness` for more information

        :param liveness: must be a :py:class:`~poptorch.Liveness` value.
        """
        assert isinstance(liveness, enums.Liveness)
        self._tensors_liveness = liveness
        return self

    def backendOptions(self) -> Dict[str, Union[int, bool]]:
        # Parent options plus the flag marking the phases as serial.
        return {**super().backendOptions(), "serial_phases_execution": True}
def from_json(self, string: str):
    """Sets values of the object from a JSON string.

    The format of the JSON string is:
    {"name.of.accessor": value}

    Each key is a dotted path resolved attribute by attribute starting from
    this object; the attribute reached is then called with the value. A list
    value is unpacked into positional arguments. A string value of the form
    ``"EnumType.Member"`` naming a member of the ``enums`` module is replaced
    by that member; any other string is passed through unchanged.

    Examples:

    >>> Options().from_json(
    ...     '{"Precision.enableFloatingPointExceptions":true}'
    ... )
    >>> Options().from_json('{"_Popart.set":["OptionName", 1]}')

    :param string: JSON object mapping accessor paths to values.
    :return: ``self`` so that calls can be chained.
    """

    def string_to_enum(value):
        # Only "Type.Member" (exactly one dot) can name an enum member.
        try:
            enum_type, enum_value = value.split(".")
        except ValueError:
            return value
        # Return the resolved member itself. Looking the member name up a
        # second time on the member is redundant and relies on
        # member-through-member access, which the enum documentation
        # discourages.
        try:
            return getattr(getattr(enums, enum_type), enum_value)
        except AttributeError:
            return value

    values_dict = json.loads(string)
    for option, v in values_dict.items():
        # Walk the dotted accessor path down to the callable setter.
        active_obj = self
        for attribute in option.split("."):
            active_obj = getattr(active_obj, attribute)
        # This parses strings into enum type and values
        if isinstance(v, str) and "." in v:
            v = string_to_enum(v)
        if isinstance(v, list):
            active_obj(*v)
        else:
            active_obj(v)
    return self
def appendToLocationExcludes(self, *excludes: str) -> "poptorch.Options":
    """Add entries to the list of excluded source locations.

    When printing the IR all the frames containing one of the excluded
    strings will be ignored.

    This is helpful to get the IR to trace back to user code rather
    than some function inside a framework.

    :param excludes: Append these exclusions to the existing list of
        exclusions.
    :return: ``self`` so that calls can be chained.
    """
    # Build a new list instead of extending in place: the current list may
    # be the very object the caller passed to sourceLocationExcludes(), and
    # it must not be modified behind their back.
    self._source_location_excludes = [
        *self._source_location_excludes, *excludes
    ]
    return self
def relaxOptimizerAttributesChecks(self, relax: bool = True
                                   ) -> "poptorch.Options":
    """Choose how loudly unexpected optimizer attributes are reported.

    Unexpected attributes met in
    :py:func:`~poptorch.PoplarExecutor.setOptimizer()` are reported either
    as warnings or as debug messages. By default PopTorch prints a warning
    the first time it encounters such attributes.

    :param relax:
        * True: Redirect warnings to the debug channel.
        * False: Print warnings about unexpected attributes (default
          behaviour).
    :return: ``self`` so that calls can be chained.
    """
    # Kept as a plain attribute rather than in the OptionsDict: only the
    # python side reads it.
    self._relax_optimizer_checks = relax
    return self
def setAvailableMemoryProportion(
        self, available_memory_proportion: Dict[str, float]):
    """Sets the amount of temporary memory made available on a per-IPU basis.

    Use this setting to control the amount of temporary memory available to
    operations such as:

    * convolution
    * matrix multiplication
    * embedding lookups
    * indexing operations

    Parameter should be a dictionary of IPU IDs and float values between 0
    and 1. (for example, ``{"IPU0": 0.5}``)

    The floating point value has the same meaning and effect as documented
    in :py:func:`~poptorch.set_available_memory`.

    :return: ``self`` so that calls can be chained.
    """

    def ipu_index(label):
        # "IPU<n>" -> n; anything not starting with "IPU" is rejected.
        assert label.startswith("IPU"), (
            "Available memory proportions are expected"
            " to be in a dictionary of {\"IPU0\": 0.5}"
            " where the 0 in IPU is the index of the"
            " IPU. Invalid key: %s" % label)
        return int(label[3:])

    # The option is stored keyed by integer IPU index.
    per_ipu = {
        ipu_index(label): proportion
        for label, proportion in available_memory_proportion.items()
    }
    self.createOrSet(available_memory_proportion=per_ipu)
    return self
def maxRepeatLogs(self, max_lines: Optional[int]) -> "poptorch.Options":
    """Limit how many times an often-repeated log line is printed.

    :param max_lines:
        If `None`, show all log messages.
        Otherwise suppress repeated messages after `max_lines` lines.
        The default is to suppress after 4 lines.
    :return: ``self`` so that calls can be chained.
    """
    if max_lines is None:
        # "No limit" is stored as the largest unsigned 64-bit value.
        limit = (1 << 64) - 1
    else:
        limit = max_lines
    self.set(max_repeat_logs=limit)
    return self
def enableExecutableCaching(self, path: str) -> "poptorch.Options":
    """Use ``path`` as a cache of Poplar executables.

    Executables are loaded from / saved to the cache so that identical
    graphs do not need to be recompiled.

    :param path: File path for Poplar executable cache store; setting
        ``path`` to ``None`` disables executable caching.
    :return: ``self`` so that calls can be chained.
    """
    popart_options = self._Popart
    caching_wanted = path is not None
    if caching_wanted:
        # The cache location is only recorded when caching is turned on.
        popart_options.set("cachePath", path)
    popart_options.set("enableEngineCaching", caching_wanted)
    return self
This option can be used to allow subsets of IPUs to overlap their work. For example, one set of IPUs could be communicating with the host while other IPUs are processing data. This option is typically used together with replicated execution, in which case it takes effect on a per-replica basis. If replication is not used, it will apply to all IPUs. :param sync_pattern: * ``Full``: Require all IPUs to synchronise on every communication between IPUs or between IPUs and host. This is the default. * ``SinglePipeline``: Allow IPUs to synchronise with the host independently, without having to synchronise with each other. This permits any one IPU to perform host IO while other IPUs are processing data. * ``ReplicaAndLadder``: Allow an IPU group to communicate with the host without requiring synchronisation between groups. This permits multiple IPU groups to alternate between performing host IO and computation. """ assert isinstance(sync_pattern, enums.SyncPattern) self.set(sync_pattern=sync_pattern.value) return self def useIpuId(self, ipu_id: int) -> "poptorch.Options": """ Use the IPU device specified by the ID (as provided by `gc-info `__). A device ID may refer to a single or to a group of IPUs (a multi-IPU device). The number of IPUs associated with the ID must be equal to the number of IPUs used by your annotated model multiplied by the replication factor. For example if your model uses 1 IPU and the replication factor is 2 you will need to provide a device ID with 2 IPU; if your model is pipelined across 4 IPUs and the replication factor is 4, you will need to provide a device ID which represents a multi-IPU device of 16 IPUs. You can use the the command-line tool `gc-info`: running `gc-info -l`, shows each device ID and a list of IPUs associated with the ID. 
def anchorTensor(self,
                 short_name: str,
                 long_name: str,
                 output_mode: Optional["poptorch.OutputMode"] = None,
                 output_return_period: Optional[int] = 1
                 ) -> "poptorch.Options":
    """Anchor a tensor such that it may be retrieved after a model run.

    :param str short_name: User defined name to be used for retrieval
    :param str long_name: The PopART name of the tensor to be anchored
    :param poptorch.OutputMode output_mode: Specifies when data should be
        returned. Default to None, in which case the tensor will use the
        same output mode used for model outputs.
    :param int output_return_period: Return period if output mode is
        ``EveryN``. Defaults to 1.
    :return: ``self`` so that calls can be chained, like the other option
        setters.
    """
    # The return period is only meaningful for EveryN: force it to 1 for
    # every other mode (including None).
    if output_mode != enums.OutputMode.EveryN:
        output_return_period = 1

    # Stored as [long name, "no explicit output mode given" flag,
    # output mode, return period].
    value = [long_name, output_mode is None]
    value += [output_mode, output_return_period]
    self.anchored_tensors[short_name] = value
    return self
def enableSyntheticData(self, enabled: bool) -> "poptorch.Options":
    """Switch between real host I/O and synthetic data generated on the IPU.

    With synthetic data host I/O is disabled; this can be used to benchmark
    models whilst simulating perfect I/O conditions.

    :param enabled:
        * True: Use data generated from a random normal distribution on
          the IPU. Host I/O is disabled.
        * False: Host I/O is enabled and real data is used.
    :return: ``self`` so that calls can be chained.
    """
    # Values of popart.SyntheticDataMode:
    #   0 = Off, 1 = Zeros, 2 = RandomNormal
    if enabled:
        synthetic_mode = 2
    else:
        synthetic_mode = 0
    self._Popart.set("syntheticDataMode", synthetic_mode)
    return self
def toDict(self) -> Dict[str, Any]:
    """Flatten the options into the dictionary handed to the C++ backend.

    All the options, except for the JIT and Precision options, are merged
    into a single dictionary to be serialised. The merge starts from the
    execution strategy's backend options, adds the PopART options, then
    lets this object and the training, distributed and tensor location
    groups update the result in turn.

    At this stage, any warnings are printed based on options set e.g. if
    a default option changes.

    :meta private:
    """
    assert not self.defaultOutputMode(
    ), "An output mode must be picked before serialisation"

    merged = self._execution_strategy.backendOptions()
    merged.update(self._popart.options)

    # Each group returns the dictionary to carry on with.
    contributors = (self, self._training, self._distributed,
                    self._tensor_locations)
    for group in contributors:
        merged = group.update(merged)

    if self._show_compilation_progress_bar:
        merged["compilation_progress_bar_fn"] = self._progress_bar
    return merged
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "poptorch_err/ExceptionHandling.hpp" #include "poptorch_err/ExceptionInfo.hpp" #include "poptorch_logging/Error.hpp" #include "poptorch_logging/LoggingLight.hpp" #include "poptorch_logging/Tracepoint.hpp" #include "poptorch/DispatchTracer.hpp" #include "poptorch/LowerToPopart.hpp" #include "poptorch/LowerToPopartFactories.hpp" #include "poptorch/SessionOptionsParser.hpp" #include "poptorch/Utils.hpp" #include "popart_compiler/CodeletsCompilation.hpp" #include "popart_compiler/Compiler.hpp" #include "popart_compiler/Utils.hpp" #include "pytorch_bridge/CompilerOptions.hpp" // All the functions here are called directly from python, therefore it's ok for // us to catch all exceptions and convert them to PoptorchError #define PTC(f) \ PoptorchCatchWrapperImpl::wrap namespace poptorch { namespace { // Everything in this namespace is a workaround because // torch::jit::toTraceableStack() is broken:torch::jit::as_module() fails to // initialise its static local variable ScriptModule and segfaults as a // result. 
namespace jit {
using namespace torch::jit;

// Recursively map a Python object to the TorchScript type used to convert it:
//   - tensor            -> TensorType
//   - None              -> NoneType
//   - tuple             -> TupleType of the element types
//   - non-empty dict    -> DictType with unified key and value types
//   - non-empty list    -> ListType with a unified element type
// Anything else, an empty dict/list, or a container whose entries cannot be
// unified to one type raises an error.
// NOTE(review): the template arguments of py::cast / std::vector appear to
// have been stripped from this copy of the file; restore them from upstream.
TypePtr inferType(py::handle input) {
  // Try tensor types
  if (THPVariable_Check(input.ptr())) {
    return TensorType::get();
  }

  if (input.is(py::none())) {
    return NoneType::get();
  }

  if (six::isTuple(input)) {
    py::tuple tuple = py::cast(input);
    std::vector element_types;
    element_types.reserve(tuple.size());

    // Tuples may be heterogeneous: each element keeps its own type.
    for (py::handle elem : tuple) {
      element_types.push_back(inferType(elem));
    }
    return TupleType::create(element_types);
  } else if (PyDict_Check(input.ptr())) {
    // Check to make sure we can generate useful input/output types
    auto dict = py::cast(input);
    size_t len = py::len(dict);
    ERROR_ON_MSG(len == 0, "Dictionary inputs must have entries");

    TypePtr key_type = nullptr;
    TypePtr value_type = nullptr;

    for (auto entry : dict) {
      // Try to infer the key type and unify it with the existing one
      auto entry_key_type = inferType(entry.first);
      auto unified_key = unifyOrInitializeType(key_type, entry_key_type);
      ERROR_ON_MSG(!unified_key,
                   c10::str("Dictionary inputs to traced functions must have "
                            "consistent type. Found ",
                            key_type->repr_str(), " and ",
                            entry_key_type->repr_str()));

      // Try to infer the value type and unify it with the existing one
      auto entry_value_type = inferType(entry.second);
      auto unified_value = unifyOrInitializeType(value_type, entry_value_type);
      ERROR_ON_MSG(!unified_value,
                   c10::str("Dictionary inputs to traced functions must have "
                            "consistent type. Found ",
                            value_type->repr_str(), " and ",
                            entry_value_type->repr_str()));

      key_type = *unified_key;
      value_type = *unified_value;
    }
    return DictType::create(key_type, value_type);
  } else if (PyList_Check(input.ptr())) {
    auto list = py::cast(input);
    size_t len = py::len(list);
    ERROR_ON_MSG(len == 0, "List trace inputs must have elements");

    // Lists must be homogeneous: every element is unified into one type.
    TypePtr element_type = nullptr;
    for (auto elem : list) {
      auto this_element_type = inferType(elem);
      auto unified_type =
          unifyOrInitializeType(element_type, this_element_type);
      ERROR_ON_MSG(!unified_type,
                   c10::str("List inputs to traced functions must have "
                            "consistent element type. Found ",
                            element_type->repr_str(), " and ",
                            this_element_type->repr_str()));
      element_type = *unified_type;
    }
    return ListType::create(element_type);
  }
  ERROR("Only nested lists and tuples of tensors are supported");
}

// Cut down version of torch::jit::toTraceableStack which only supports nested
// tuples and lists of tensors.
// Converts the input tuple to an IValue of the type found by inferType() and
// returns the tuple's elements as a Stack.
Stack toTraceableStack(const py::tuple &inputs) {
  return toIValue(inputs, inferType(inputs)).toTupleRef().elements().vec();
}

} // namespace jit

// Scope guard: holds a callable (by inheriting from it) and invokes it from
// the destructor.
// NOTE(review): the template parameter list and the static_cast target type
// appear to have been stripped from this copy of the file.
template class CallOnExit : Func {
public:
  explicit CallOnExit(Func f) : Func(std::move(f)) {}
  ~CallOnExit() { std::invoke(*static_cast(this)); }
};

// Keep a static map to gather up all the cpu calls.
CPUCallbackMap callbacks;

// Returns true when a callback has already been registered under ID.
bool alreadyRegistered(const std::string &ID) {
  return callbacks.find(ID) != callbacks.end();
}

// Record the data pointers of the given tensors in the metadata of the
// callback registered under ID. Raises an error if ID is unknown.
void registerBuffersWithCallback(
    const std::string &ID,
    std::vector &input_tensors, // NOLINT
    std::vector &output_tensors // NOLINT
) {
  auto itr = callbacks.find(ID);

  ERROR_ON_MSG(itr == callbacks.end(), "Callback has not been registered.");

  popart_compiler::CallbackMetadata &metadata = itr->second;

  // Track the input tensors. Our python creates a persistent storage location
  // for the inputs and outputs.
  for (at::Tensor &tensor : input_tensors) {
    metadata.input_pointers.push_back(tensor.data_ptr());
  }

  // Same for output.
for (at::Tensor &tensor : output_tensors) {
    // Each output is replaced by its contiguous version before its address
    // is recorded (the caller's vector element is updated in place).
    tensor = tensor.contiguous();
    metadata.output_pointers.push_back(tensor.data_ptr());
  }
}

// Python interface to map a given CPU op with the IR calls.
// Registers `obj` under ID in `callbacks`; does nothing if ID is already
// present. `obj` must expose `execute()` and `registerPersistentData()`.
void registerCPUCallBack(const py::object &obj, const std::string &ID) {
  // Map the string identifier to the metadata.
  bool inserted;
  decltype(callbacks)::iterator it;
  std::tie(it, inserted) = callbacks.try_emplace(ID);

  // Skip if we've already added a callback for this function.
  if (!inserted) {
    return;
  }

  // Structure to store the information given by python to be forwarded to the
  // backend.
  popart_compiler::CallbackMetadata &metadata = it->second;

  // Wrap that in a lambda so we don't have to expose the naked pytorch function
  // pointer thing. `obj` is captured by value in both lambdas.
  metadata.the_callback = [=]() {
    // We wrap the user call in a function called "execute"
    obj.attr("execute")();
  };

  metadata.buffer_registration_callback = [=]() {
    obj.attr("registerPersistentData")();
  };
}

// Invoke the buffer registration callback of every registered CPU callback.
void initCallbackBuffers() {
  for (auto &pair : callbacks) {
    pair.second.buffer_registration_callback();
  }
}

// IPyValue implementation backed by a pybind11 handle.
// NOTE(review): the template headers of the two constructors and the template
// arguments of cast / isinstance / std::function below appear to have been
// stripped from this copy of the file; restore them from upstream.
class PybindValue : public IPyValue {
public:
  // Stores the object in _maybe_obj and points _value at it.
  template ::value, int> = 0>
  explicit PybindValue(T obj) {
    _maybe_obj = obj;
    _value = _maybe_obj;
  }

  // Wraps a bare handle; _maybe_obj is left default-constructed.
  template ::value, int> = 0>
  explicit PybindValue(T handle) : _value(handle) {}

  // Returns a callable that acquires the GIL and forwards (x, y) to the
  // wrapped Python function.
  std::function toFunction() const override {
    py::function py_func = _value.cast();
    return [py_func](int x, int y) {
      py::gil_scoped_acquire acquire;
      py_func(x, y);
    };
  }

  bool isBoolean() const override { return py::isinstance(_value); }

  bool toBoolean() const override { return _value.cast(); }

  bool isDouble() const override {
    // Python's float type is actually double
    // precision.
return py::isinstance(_value);
  }

  double toDouble() const override { return _value.cast(); }

  bool isInt() const override { return py::isinstance(_value); }

  std::uint64_t toUInt64() const override { return _value.cast(); }

  std::int64_t toInt64() const override { return _value.cast(); }

  bool isString() const override { return py::isinstance(_value); }

  // Strings are returned as-is and integers are formatted with
  // std::to_string; any other type raises an error.
  std::string toString() const override {
    if (isString()) {
      return _value.cast();
    }
    if (isInt()) {
      return std::to_string(_value.cast());
    }
    ERROR("Don't know how to convert type " << _value.get_type()
                                            << " to string");
  }

  bool isSetListOrTuple() const override {
    return py::isinstance(_value) || py::isinstance(_value) ||
           py::isinstance(_value);
  }

  // Calls fn on a PybindValue wrapper of every element.
  void forEachInList(std::function fn) const override {
    for (auto option : _value.cast()) {
      fn(PybindValue(option));
    }
  }

  bool isDict() const override { return py::isinstance(_value); }

  // Calls fn on (key, value) wrappers of every dictionary entry.
  void forEachInDict(std::function fn) const override {
    for (auto option : _value.cast()) {
      fn(PybindValue(option.first), PybindValue(option.second));
    }
  }

  // Returns the wrapped value stored under key, or nullptr if key is absent.
  std::unique_ptr getFromDict(const std::string &key) const override {
    auto dict = _value.cast();
    if (!dict.contains(key)) {
      return nullptr;
    }
    return std::make_unique(dict[key.c_str()]);
  }

  std::uint64_t getListSize() const override { return _value.cast().size(); }

  // Returns the wrapped element at index, or nullptr if index is out of
  // range.
  std::unique_ptr getFromList(const std::uint64_t index) const override {
    auto list = _value.cast();
    if (index >= list.size()) {
      return nullptr;
    }
    return std::make_unique(list[index]);
  }

  // String form of the Python type of the wrapped value.
  std::string type() const override { return py::str(_value.get_type()); }

private:
  // pybind11 handles do not keep a reference to the python object so it might
  // disappear if the parent doesn't hold a reference to it, so just to be safe
  // keep a reference if possible.
py::object _maybe_obj; py::handle _value; }; template T getOptimizerValue(const py::dict &d, const std::string &key) { ERROR_ON_MSG(!d.contains(key), "Missing optimizer value for '" << key << "' in " << py::str(d.cast())); return d[key.c_str()].cast(); } template void getOptimizerValue(T &value, const py::dict &d, const std::string &key) { value = getOptimizerValue(d, key); } void copyParametersDict(popart_compiler::Optimizer *out, const py::dict &in) { logging::LogContext ctx_func("copyParametersDict"); out->parameters.resize(in.size()); std::uint64_t param_idx = 0; for (auto optimizer_field : in) { auto ¶m = out->parameters[param_idx]; param_idx++; const std::string name = optimizer_field.first.cast(); logging::LogContext ctx("attr: " + name); std::pair p = optimizer_field.second.cast>(); ERROR_ON(name.size() >= sizeof(param.name)); // We need to use a C-style string here to avoid ABI issues. snprintf(reinterpret_cast(param.name), sizeof(param.name), "%s", name.c_str()); param.value = p.first; param.is_const = p.second; } } // Process the user provided dictionary and extract the relevant optimizer // information. std::vector parseOptimizers(const py::dict &opt) { if (opt.empty()) { return {}; } popart_compiler::OptimizerType type = popart_compiler::OptimizerType::NONE; std::uint64_t num_groups; type = static_cast( getOptimizerValue(opt, "optimizer_type")); auto defaults = getOptimizerValue(opt, "defaults"); auto groups = getOptimizerValue(opt, "groups"); num_groups = groups.size(); std::vector optimizers; // Note: all the group variables and optimizer variables are // automatically forwarded to the Compiler backend however // the optimizer attributes are extracted here. 
bool use_tf_variant = false;
  // "useTfVariant" is only read for the RMSprop optimizer types.
  if (type == popart_compiler::OptimizerType::RMSPROP ||
      type == popart_compiler::OptimizerType::RMSPROP_CENTERED) {
    getOptimizerValue(use_tf_variant, opt, "useTfVariant");
  }

  // "maxGradNorm" is optional and defaults to infinity.
  float max_grad_norm = std::numeric_limits::infinity();
  if (opt.contains("maxGradNorm")) {
    getOptimizerValue(max_grad_norm, opt, "maxGradNorm");
  }

  if (opt.contains("accumType")) {
    bool accum_type = false;
    bool first_order_momentum_accum_type = false;
    bool second_order_momentum_accum_type = false;

    // Indicate whether the optimizer should use float16 types
    getOptimizerValue(accum_type, opt, "accumType");

    // SGD1/SGD2 only provide "velocityAccumType"; every other optimizer type
    // provides both momentum accumulator types.
    if (type == popart_compiler::OptimizerType::SGD1 ||
        type == popart_compiler::OptimizerType::SGD2) {
      getOptimizerValue(first_order_momentum_accum_type, opt,
                        "velocityAccumType");
    } else {
      getOptimizerValue(first_order_momentum_accum_type, opt,
                        "firstOrderMomentumAccumType");
      getOptimizerValue(second_order_momentum_accum_type, opt,
                        "secondOrderMomentumAccumType");
    }
    // Create one Optimizer per parameter group + 1 for defaults
    for (std::uint64_t i = 0; i <= num_groups; ++i) {
      optimizers.emplace_back(type, accum_type,
                              first_order_momentum_accum_type,
                              second_order_momentum_accum_type, use_tf_variant,
                              max_grad_norm);
    }
  } else {
    // Create one Optimizer per parameter group + 1 for defaults
    for (std::uint64_t i = 0; i <= num_groups; ++i) {
      optimizers.emplace_back(type, use_tf_variant, max_grad_norm);
    }
  }

  // Index 0 receives the 'defaults' attributes.
  copyParametersDict(optimizers.data(), defaults);
  // For each group copy all the attributes
  // Start at 1: index 0 is 'defaults'
  std::uint64_t group = 1;
  for (auto group_attr : groups) {
    copyParametersDict(&optimizers[group], group_attr.cast());
    ++group;
  }

  return optimizers;
}

// Build a map from each parameter name to the data pointer of the tensor at
// the same position. Raises an error if the two tuples differ in length.
// NOTE(review): the template arguments of std::map and cast appear to have
// been stripped from this copy of the file.
std::map getParameterBuffers(const pybind11::tuple &names,
                             const pybind11::tuple &tensors) {
  ERROR_ON(names.size() != tensors.size());
  std::map parameters;
  torch::jit::Stack stack = jit::toTraceableStack(tensors);
  for (std::uint64_t i = 0; i < names.size(); ++i) {
    parameters.insert(
        {names[i].cast(),
         stack[i].toTensor().data_ptr()});
  }
  return parameters;
}

// We have three sets of tensors.
// 1. Tensors in the graph from jit::trace.
// 2. Tensors in the original user model.
// 3. Tensors in the graph from jit::trace which lowerGraph has removed unused
// tensors from. We remap them by mapping the indices of 1. to the tensors of 3.
// and then creating a new vector using 3 with that map as a guide to tell us
// which tensors have been culled.
//
// The result has one slot per traced tensor; slot i holds the entry of
// model_parameters whose name maps, in python_tensors, to the tensor sharing
// traced_tensors[i]'s data pointer. Slots with no match stay
// default-constructed.
std::vector remapTensors(const pybind11::dict &python_tensors,
                         const pybind11::dict &model_parameters,
                         const std::vector &traced_tensors) {
  // Create a set of the pointers actually in use.
  std::unordered_map tensor_pointers;
  for (std::size_t i = 0; i < traced_tensors.size(); ++i) {
    tensor_pointers.insert({traced_tensors[i].data_ptr(), i});
  }

  std::vector returnee;
  returnee.resize(traced_tensors.size());

  for (auto element : model_parameters) {
    // NOTE(review): option_name is not used after this cast.
    auto option_name = element.first.cast();

    // Get the traced tensor registered under the same name.
    auto dict_itr = python_tensors[element.first];
    at::Tensor traced_tensor = dict_itr.cast();

    auto itr = tensor_pointers.find(traced_tensor.data_ptr());
    if (itr != tensor_pointers.end()) {
      at::Tensor tensor = element.second.cast();
      returnee[itr->second] = tensor;
    }
  }

  return returnee;
}

// python_names and python_tensors are the parameters from the python trace.
// And trace_tensors is a subset of python_tensors (The unused parameters have
// been removed). So we build a map[tensor] = name based on the python trace
// which we then use to build the list of the names of the parameters in
// traced_tensors.
std::vector getParameterNames(const pybind11::dict &python_tensors,
                              const std::vector &traced_tensors) {
  // Create a set of the pointers actually in use.
  std::unordered_map tensor_pointers;
  for (std::size_t i = 0; i < traced_tensors.size(); ++i) {
    tensor_pointers.insert({traced_tensors[i].data_ptr(), i});
  }

  // Get the names of each tensor which hasn't been removed as unused.
std::vector names; names.resize(tensor_pointers.size()); // Extract the python strings into an actual language. for (auto element : python_tensors) { at::Tensor tensor = element.second.cast(); auto itr = tensor_pointers.find(tensor.data_ptr()); if (itr != tensor_pointers.end()) { std::string option_name = element.first.cast(); names[itr->second] = option_name; } } return names; } AnchorList parseAnchors(const py::list &list) { AnchorList map; for (auto elem : list) { auto anchor = elem.cast(); map.push_back(Anchor(anchor[0].cast(), anchor[2].cast(), anchor[3].cast())); } return map; } void parseSessionOptionsVoid(const py::dict &opts) { SessionOptionsParser{PybindValue(opts)}; } void buildTensorList(const torch::jit::IValue &value, std::vector *tensors, bool allow_tensor_only = false) { if (value.isTuple()) { ERROR_ON(allow_tensor_only); for (auto &element : value.toTuple()->elements()) { buildTensorList(element, tensors); } } else if (value.isList()) { ERROR_ON(allow_tensor_only); for (const auto element : value.toList()) { buildTensorList(element, tensors); } } else if (value.isTensor()) { tensors->push_back(value.toTensor()); } else { ERROR("Unsupported value " << value.tagKind()); } } torch::jit::script::Module *asModule(py::handle h) { return reinterpret_cast( pybind11::detail::values_and_holders( reinterpret_cast(h.ptr())) .begin() ->value_ptr()); } void identifyZeroSizedTensors(const std::vector &tensors) { for (const at::Tensor &tensor : tensors) { auto sizes = tensor.sizes(); if (std::any_of(sizes.begin(), sizes.end(), [](auto dim) { return dim == 0; })) { std::stringstream err; err << "Zero-sized tensors are unsupported (Got shape ["; for (std::size_t i = 0; i < sizes.size() - 1; i++) { err << sizes[i] << ", "; } err << sizes[sizes.size() - 1] << "])."; ERROR(err.str()); } } } poptorch::LowerToPopart lowerToPopartFromDispatch(const pybind11::dict &options, const py::function &attribute_accessor, bool training, const py::dict &opt_dict, const py::list 
&anchors) { auto cleanup = CallOnExit([] { // Clear the callbacks after compilation. callbacks.clear(); }); SessionOptionsParser options_parser{PybindValue(options)}; AnchorList anchors_list = parseAnchors(anchors); std::vector optimizers = parseOptimizers(opt_dict); return lowerToPopartFromDispatch( options_parser, training, std::move(anchors_list), []() { initCallbackBuffers(); }, std::move(optimizers), [&attribute_accessor](const std::string &attributes_id_str) { return std::make_unique( attribute_accessor(attributes_id_str)); }, callbacks); } void mapParamsToNames(const pybind11::tuple &names, const pybind11::tuple &tensors) { ERROR_ON(names.size() != tensors.size()); torch::jit::Stack stack = jit::toTraceableStack(tensors); for (uint64_t i = 0; i < names.size(); ++i) { const auto name = names[i].cast(); const auto tensor = stack[i].toTensor(); setParameterName(tensor, name); } } void setPerReplica(const std::string ¶m_name, py::handle tensor, int comm_group_type, int shards, int variable_retrieval_mode) { at::Tensor t = torch::jit::toTypeInferredIValue(tensor).toTensor(); setParameterPerReplica(param_name, t, comm_group_type, shards, variable_retrieval_mode); } std::string convertToString(const std::vector &str) { return std::string(str.data(), str.size()); } std::vector convertToCharVec(const std::string &str) { return std::vector(str.begin(), str.end()); } pybind11::list toPythonList(std::vector &&outputs) { pybind11::list pylist(outputs.size()); for (std::size_t i = 0; i < outputs.size(); ++i) { pylist[i] = torch::jit::toPyObject(std::move(outputs[i])); } return pylist; } class Error : public py::object { public: Error() = default; Error(handle scope, const char *name, handle base = PyExc_Exception) { std::string full_name = scope.attr("__name__").cast() + std::string(".") + name; m_ptr = PyErr_NewException(full_name.c_str(), base.ptr(), nullptr); if (hasattr(scope, "__dict__") && scope.attr("__dict__").contains(name)) { pybind11::pybind11_fail( "Error 
during initialization: multiple incompatible " "definitions with name \"" + std::string(name) + "\""); } scope.attr(name) = *this; } // Sets the current python myexception to this exception object with the given // message void setWhat(const std::string &message) { _what = message; } const std::string &getWhat() { return _what; } void setErrorIndicator() { PyErr_SetString(m_ptr, _what.c_str()); } void setMessage(const std::string &message) { py::object x = py::cast(message); PyObject_SetAttrString(m_ptr, "message", x.ptr()); } void setType(const std::string &type) { py::object x = py::cast(type); PyObject_SetAttrString(m_ptr, "type", x.ptr()); } void setLocation(const std::string &location) { py::object x = py::cast(location); PyObject_SetAttrString(m_ptr, "location", x.ptr()); } private: std::string _what; }; class RecoverableError : public Error { public: using Error::Error; void setRecoveryAction(const std::string &recoveryAction) { py::object x = py::cast(recoveryAction); PyObject_SetAttrString(m_ptr, "recovery_action", x.ptr()); } }; std::unique_ptr error; std::unique_ptr recoverable_error; std::unique_ptr unrecoverable_error; /* * This structure enables poptorch.Error objects to be thrown python-side from * both our pybind11 interface and torch's own. Our pybind11 exception handler * catches this class specifically, whilst torch's catches any PyTorchError * subclass and uses it to deduce the python type using the overridden * python_type() method. * The function convertToPoptorchExceptionOrRethrow() processes all the * exception types we're interested in, extracts detail, and marshals them as * instances of PoptorchErrorInfo which is then used to create instances of this * class. We put try..catch wrappers round every pybind11 entry point using the * macro CATCH_AND_RETHROW_AS_POPTORCH_EXCEPTION and pass them to * convertToPoptorchExceptionOrRethrow(). 
*/ struct PoptorchError : public torch::PyTorchError { public: explicit PoptorchError(const PoptorchErrorInfo &info) : torch::PyTorchError(info.long_message), _info(info) {} PyObject *python_type() override { return setupPyError(false); } void setErrorIndicator() const { setupPyError(true); } private: PyObject *setupPyError(bool set_indicator) const; public: const PoptorchErrorInfo _info; }; PyObject *PoptorchError::setupPyError(bool set_indicator) const { for (int64_t i = _info.stack.size() - 1; i >= 0; --i) { poptorch::logging::LogContext::push(_info.stack[i].c_str()); } Error *err = nullptr; switch (_info.category) { case ErrorCategory::RuntimeRecoverable: { recoverable_error->setRecoveryAction(_info.recovery_action); err = recoverable_error.get(); break; } case ErrorCategory::RuntimeUnrecoverable: { err = unrecoverable_error.get(); break; } default: { err = error.get(); break; } } err->setType(_info.type); err->setMessage(_info.message); err->setLocation(_info.location); // Note: on Ubuntu 20.04 PyErr_SetString(), i.e setWhat(), // needs to be the last call in register_exception_translator() err->setWhat(_info.long_message); if (set_indicator) { err->setErrorIndicator(); } return err->ptr(); } void doThrowPoptorchError(const PoptorchErrorInfo &info) { throw PoptorchError(info); } } // namespace namespace bindings { void initialiseExceptionHandling(pybind11::handle m) { error = std::make_unique(m, "Error"); recoverable_error = std::make_unique(m, "RecoverableError", *error); unrecoverable_error = std::make_unique(m, "UnrecoverableError", *error); poptorch::setPoptorchErrorThrower(doThrowPoptorchError); } void copyWeightsToHostImpl( const std::shared_ptr &executable, const pybind11::tuple ¶meter_names, const pybind11::tuple ¶meter_tensors) { poptorch::logging::Tracepoint tp{"copyWeightsToHost"}; // Copy the weights or warn if this is before first time compilation. 
if (!executable) { logging::log( logging::Level::Warn, "Call to copyWeightsToHost ignored as model has not been compiled " "(PopTorch will compile models on first invocation)."); } else { executable->copyWeightsToHost( getParameterBuffers(parameter_names, parameter_tensors)); } } void copyWeightsToDeviceImpl( const std::shared_ptr &executable, const pybind11::tuple ¶meter_names, const pybind11::tuple ¶meter_tensors) { poptorch::logging::Tracepoint tp{"copyWeightsToDevice"}; // Copy the weights or warn if this is before first time compilation. if (!executable) { logging::log( logging::Level::Warn, "Call to copyWeightsToDevice ignored as model has not been compiled " "(PopTorch will compile models on first invocation)."); } else { executable->copyWeightsToDevice( getParameterBuffers(parameter_names, parameter_tensors)); } } void copyNamedBuffersToDeviceImpl( const std::shared_ptr &executable, const pybind11::tuple &buffer_names, const pybind11::tuple &buffer_tensors) { poptorch::logging::Tracepoint tp{"copyNamedBuffersToDevice"}; // Copy the named buffers or warn if this is before first time compilation. 
if (!executable) {
    logging::log(
        logging::Level::Warn,
        "Call to copyNamedBuffersToDevice ignored as model has not been "
        "compiled (PopTorch will compile models on first invocation).");
  } else {
    executable->copyNamedBuffersToDevice(
        getParameterBuffers(buffer_names, buffer_tensors));
  }
}

// The accessors below all raise "No built executable" when called with a null
// executable.

// Returns the PopART IR of the executable.
std::string getPopartIR(const std::shared_ptr &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  return executable->getPopartIR();
}

// Returns the executable's tensor names as a Python set.
py::set getTensorNames(const std::shared_ptr &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  return py::cast(executable->getTensorNames());
}

void detachFromDevice(
    const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  executable->detachFromDevice();
}

void attachToDevice(
    const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  executable->attachToDevice();
}

bool isAttachedToDevice(
    const std::shared_ptr &executable) {
  ERROR_ON_MSG(!executable, "No built executable");
  return executable->isAttachedToDevice();
}

// Set the PopTorch log level from its integer value. Values above
// logging::Level::Off, and the value 5, are rejected.
// NOTE(review): 5 presumably has no matching logging::Level entry; confirm
// against the enum definition.
void setLogLevel(std::uint64_t level) {
  ERROR_ON(level > static_cast(logging::Level::Off) ||
           level == 5);
  logging::setLogLevel(static_cast(level));
}

void loadEngineAndConnectStreams(
    const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  executable->loadEngineAndConnectStreams();
}

// Parse optimizer_dict and pass the resulting optimizers to the executable.
void updateOptimizers(
    const std::shared_ptr &executable,
    const py::dict &optimizer_dict) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");

  // Parse the Python optimizer dictionary into backend optimizers (an empty
  // dictionary yields an empty list).
std::vector optimizers = parseOptimizers(optimizer_dict);
  executable->updateOptimizers(optimizers);
}

// Run the executable on the given inputs and return the outputs re-packed
// into the (possibly nested) tuple/list structure described by
// executable->outputTypes().
// NOTE(review): the template arguments of std::vector / std::function /
// std::shared_ptr appear to have been stripped from this copy of the file.
std::vector execute(const std::shared_ptr &executable,
                    const pybind11::tuple &inputs) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");

  // Create a jit stack from the incoming pytorch tensors.
  torch::jit::Stack input_stack = jit::toTraceableStack(inputs);

  // And convert them into at::Tensors which we can then resolve the
  // address of.
  std::vector input_tensors;
  for (const torch::jit::IValue &value : input_stack) {
    buildTensorList(value, &input_tensors);
  }

  std::vector output_tensors = executable->run(input_tensors);

  std::vector returnee;

  // Reshape the output tensors in the structure expected by the user
  auto tensor_it = output_tensors.begin();
  const auto &output_types = executable->outputTypes();
  auto type_it = output_types.begin();
  ERROR_ON(type_it == output_types.end());

  // First tuple encodes the number of (actual) outputs
  std::uint64_t num_outputs = type_it->num_elements;
  // Builds the Python object described by *type_it. A Tensor entry consumes
  // the next output tensor; a Tuple/List entry advances type_it and recurses
  // once per element. The caller advances type_it onto the entry to process.
  std::function process_output;
  process_output = [&]() -> pybind11::object { // NOLINT
    ERROR_ON_MSG(type_it == output_types.end(), "Invalid OutputTypes object");
    switch (type_it->type) {
    case popart_compiler::OutputElemType::Tensor: {
      ERROR_ON_MSG(tensor_it == output_tensors.end(),
                   "Not enough tensors to unpack");
      auto object = torch::jit::toPyObject(*tensor_it);
      tensor_it++;
      return object;
    }
    case popart_compiler::OutputElemType::Tuple: {
      std::int64_t num_elements = type_it->num_elements;
      pybind11::tuple pytuple(num_elements);
      for (std::int64_t i = 0; i < num_elements; ++i) {
        type_it++;
        pytuple[i] = process_output();
      }
      return std::move(pytuple);
    }
    case popart_compiler::OutputElemType::List: {
      std::int64_t num_elements = type_it->num_elements;
      pybind11::list pylist(num_elements);
      for (std::int64_t i = 0; i < num_elements; ++i) {
        type_it++;
        pylist[i] = process_output();
      }
      return std::move(pylist);
    }
    default:
      ERROR("Unsupported OutputType");
    }
};

  // Build each top-level output in turn.
  for (std::uint64_t i = 0; i < num_outputs; ++i) {
    type_it++;
    returnee.push_back(process_output());
  }
  // Every tensor returned by the run must have been consumed.
  ERROR_ON_MSG(tensor_it != output_tensors.end(),
               "Not all the output tensors were unpacked");

  return returnee;
}

// The accessors below raise "No built executable" when called with a null
// executable, then forward to the executable's compiler.

void setRngState(std::shared_ptr &executable,
                 std::uint64_t seed,
                 const std::vector &rng_state) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  auto &compiler = executable->getCompiler();
  compiler.setRngState(seed, rng_state);
}

std::uint64_t getRandomSeed(const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  const auto &compiler = executable->getCompiler();
  return compiler.getRandomSeed();
}

std::vector getRngState(const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  ERROR_ON_MSG(!executable, "No built executable");
  const auto &compiler = executable->getCompiler();
  return compiler.getRngState();
}

// Read the optimizer state into a Python dictionary with two entries:
// "ipu_state" (optimiser state tensors, filled by the compiler) and
// "ipu_param" (optimiser parameters copied from the metadata). Both are keyed
// by tensor id.
// NOTE(review): the template arguments of std::vector / std::shared_ptr
// appear to have been stripped from this copy of the file.
py::dict readOptimizerState(
    const std::shared_ptr &executable) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  py::dict optim_state;
  py::dict state_tensors;
  py::dict param_tensors;
  ERROR_ON_MSG(!executable, "No built executable");
  auto &compiler = executable->getCompiler();
  std::vector metadata_list =
      compiler.optimizerTensorMetadataList();

  std::vector host_buffers;
  for (const popart_compiler::TensorMetadata &meta : metadata_list) {
    // Allocate a host tensor with the shape and dtype given by the metadata.
    at::Tensor tensor =
        at::empty({meta.shape}, onnxStrToScalarType(meta.dtype)).contiguous();

    if (meta.num_bytes == -1) {
      // num_bytes == -1 indicates it's an optimiser state tensor (variable)
      host_buffers.push_back(tensor.data_ptr());
      state_tensors[py::cast(meta.id)] = py::cast(tensor);
    } else {
      // Otherwise it's a stream/constant optimiser parameter that we can copy
      // immediately
      std::memcpy(tensor.data_ptr(), meta.data, meta.num_bytes);
      param_tensors[py::cast(meta.id)] = py::cast(tensor);
    }
  }
  // The compiler fills the collected state buffers.
  compiler.fillHostOptimizerStateTensorData(host_buffers);
optim_state["ipu_state"] = std::move(state_tensors); optim_state["ipu_param"] = std::move(param_tensors); return optim_state; } void writeOptimizerState( const std::shared_ptr &executable, const py::dict &optim_state) { poptorch::logging::Tracepoint tp{__FUNCTION__}; ERROR_ON_MSG(!executable, "No built executable"); auto &compiler = executable->getCompiler(); std::vector metadata_list = compiler.optimizerTensorMetadataList(); std::vector host_buffers; auto state = optim_state["ipu_state"]; auto params = optim_state["ipu_param"]; for (const popart_compiler::TensorMetadata &meta : metadata_list) { if (meta.num_bytes == -1) { // num_bytes == -1 indicates it's an optimiser state tensor (variable) if (!state.contains(py::cast(meta.id))) { logging::log( logging::Level::Warn, std::string("writeOptimizerState: ignoring missing state " + std::string(meta.id)) .c_str()); host_buffers.push_back(nullptr); continue; } at::Tensor tensor = state[py::cast(meta.id)].cast(); host_buffers.push_back(tensor.data_ptr()); } else { if (!params.contains(py::cast(meta.id))) { logging::log( logging::Level::Warn, std::string("writeOptimizerState: ignoring missing parameter " + std::string(meta.id)) .c_str()); continue; } // Otherwise it's a stream/constant optimiser parameter that we can copy // immediately at::Tensor tensor = params[py::cast(meta.id)].cast(); std::memcpy(meta.data, tensor.data_ptr(), meta.num_bytes); } } compiler.writeDeviceOptimizerStateTensorData(host_buffers); } std::vector getTimestamps(const std::shared_ptr &executable) { poptorch::logging::Tracepoint tp{__FUNCTION__}; ERROR_ON_MSG(!executable, "No built executable"); const auto &compiler = executable->getCompiler(); popart_compiler::Timestamps ts = compiler.getTimestamps(); py::list input; py::list input_complete; py::list output; py::list output_complete; for (const auto &t : ts.input) { input.append(py::cast(t)); } for (const auto &t : ts.input_complete) { input_complete.append(py::cast(t)); } for (const auto &t : 
ts.output) { output.append(py::cast(t)); } for (const auto &t : ts.output_complete) { output_complete.append(py::cast(t)); } return {input, input_complete, output, output_complete}; } bool pyIsGraphNondeterministic(py::handle h) { auto *module = asModule(h); auto forward = module->get_method("forward"); auto graph_and_tensors = torch::jit::LowerGraph(*forward.graph(), module->_ivalue()); auto graph = graph_and_tensors.first; const auto &nodes = graph->nodes(); return std::any_of(nodes.begin(), nodes.end(), [](const torch::jit::Node *n) { return poptorch::isNondeterministic(*n); }); } void saveExecutableToFile( const std::shared_ptr &executable, const std::string &export_filename) { poptorch::logging::Tracepoint tp{__FUNCTION__}; executable->getCompiler().saveExecutableToFile(export_filename.c_str()); } void appendPoptorchMetadataToFile(const std::string &serialized_poptorch_data, const std::string &export_filename) { poptorch::logging::Tracepoint tp{__FUNCTION__}; popart_compiler::Compiler::appendPoptorchMetadataToFile( serialized_poptorch_data.c_str(), serialized_poptorch_data.size(), export_filename.c_str()); } uint64_t cycleCount(const std::shared_ptr &executable) { ERROR_ON_MSG(!executable, "No built executable"); return executable->getCompiler().getCycleCount(); } py::bytes importPoptorchMetadataFromFile(const std::string &import_filename) { poptorch::logging::Tracepoint tp{__FUNCTION__}; std::vector metadata_buffer = popart_compiler::Compiler::importPoptorchMetadataFromFile( import_filename.c_str()); return py::bytes(metadata_buffer.data(), metadata_buffer.size()); } std::shared_ptr processDispatchAndImportExecutable( const pybind11::dict &options, const py::function &attribute_accessor, bool is_training, const py::dict &opt_dict, const py::list &anchors, const std::string &import_filename) { auto lower = lowerToPopartFromDispatch(options, attribute_accessor, is_training, opt_dict, anchors); return lower.loadExecutableFromFile(import_filename); } 
// Lower the dispatched graph to PopART and compile it.
// The Python GIL is released (py::gil_scoped_release) only for the
// lower.compile() call; the lowering itself runs with the GIL held.
std::shared_ptr compileWithManualTracing(
    const pybind11::dict &options, const py::function &attribute_accessor,
    bool is_training, const py::dict &opt_dict, const py::list &anchors) {
  poptorch::logging::Tracepoint tp{__FUNCTION__};
  logging::log(logging::Level::Debug, "Compile with manual tracing");
  auto lower = lowerToPopartFromDispatch(options, attribute_accessor,
                                         is_training, opt_dict, anchors);
  py::gil_scoped_release release;
  return lower.compile();
}

// Set the PopART log level from an unsigned integer.
// Values greater than logging::Level::Off, and the value 5, are rejected via
// ERROR_ON before the level is forwarded.
// NOTE(review): the reason 5 is excluded is not visible here - presumably an
// unused slot in the Level enum; confirm.
void setPopartLogLevelUInt(std::uint64_t level) {
  ERROR_ON(level > static_cast(logging::Level::Off) || level == 5);
  popart_compiler::setPopartLogLevel(static_cast(level));
}

} // namespace bindings
} // namespace poptorch

// Definition of the `poptorch_core` Python extension module: exposes the
// executable handle type, CompilerOptions, the binding functions above, two
// enums and an exception translator.
// NOTE(review): the py::class_/py::enum_ template arguments are missing from
// this excerpt - confirm against the upstream file.
PYBIND11_MODULE(poptorch_core, m) { // NOLINT
  py::class_>
      give_me_a_name(m, "InternalPoplarExecutable");

  py::class_(m, "CompilerOptions")
      .def(py::init<>())
      .def_property(
          "source_location_excludes",
          // Getter: convert each stored entry with convertToString.
          [](const poptorch::CompilerOptions &options) {
            std::vector excludes;
            std::transform(options.dispatcher.source_location_excludes.begin(),
                           options.dispatcher.source_location_excludes.end(),
                           std::back_inserter(excludes),
                           &poptorch::convertToString);
            return excludes;
          },
          // Setter: replace the stored entries, converting each value with
          // convertToCharVec.
          [](poptorch::CompilerOptions &options,
             const std::vector &val) {
            options.dispatcher.source_location_excludes.clear();
            std::transform(
                val.begin(), val.end(),
                std::back_inserter(options.dispatcher.source_location_excludes),
                &poptorch::convertToCharVec);
          },
          // NOTE(review): the adjacent literals below are concatenated
          // without separating spaces ("excludedstrings", "ratherthan").
          "When printing the IR all the frames containing one of the excluded"
          "strings will be ignored.\n\n"
          "This is helpful to get the IR to trace back to user code rather"
          "than some function inside a framework.");

  m.def("isGraphNondeterministic",
        PTC(poptorch::bindings::pyIsGraphNondeterministic));
  m.def("saveExecutableToFile", PTC(poptorch::bindings::saveExecutableToFile));
  m.def("appendPoptorchMetadataToFile",
        PTC(poptorch::bindings::appendPoptorchMetadataToFile));
  m.def("cycleCount", PTC(poptorch::bindings::cycleCount));
  m.def("importPoptorchMetadataFromFile",
        PTC(poptorch::bindings::importPoptorchMetadataFromFile));
  m.def("execute", PTC(poptorch::bindings::execute));
  m.def("updateOptimizers", PTC(poptorch::bindings::updateOptimizers));
  m.def("getTimestamps", PTC(poptorch::bindings::getTimestamps));
  m.def("readOptimizerState", PTC(poptorch::bindings::readOptimizerState));
  m.def("setRngState", PTC(poptorch::bindings::setRngState));
  m.def("getRngState", PTC(poptorch::bindings::getRngState));
  m.def("getRandomSeed", PTC(poptorch::bindings::getRandomSeed));
  m.def("writeOptimizerState", PTC(poptorch::bindings::writeOptimizerState));
  m.def("loadEngineAndConnectStreams",
        PTC(poptorch::bindings::loadEngineAndConnectStreams));
  m.def("copyWeightsToDevice_impl",
        PTC(poptorch::bindings::copyWeightsToDeviceImpl));
  m.def("copyNamedBuffersToDevice_impl",
        PTC(poptorch::bindings::copyNamedBuffersToDeviceImpl));
  m.def("copyWeightsToHost_impl",
        PTC(poptorch::bindings::copyWeightsToHostImpl));
  // Default: query the hardware version for a single IPU.
  m.def("ipuHardwareVersion", PTC(poptorch::popart_compiler::ipuHardwareVersion),
        py::arg("numIpus") = 1);
  m.def("setCustomCodeletsPath",
        PTC(poptorch::popart_compiler::setCustomCodeletsPath));
  m.def("setLogLevel", PTC(poptorch::bindings::setLogLevel),
        py::arg("level") = 2);
  m.def("setPopartLogLevel", PTC(poptorch::bindings::setPopartLogLevelUInt));
  m.def("_getPopartIR", PTC(poptorch::bindings::getPopartIR));
  m.def("_getTensorNames", PTC(poptorch::bindings::getTensorNames));
  m.def("detachFromDevice", PTC(poptorch::bindings::detachFromDevice));
  m.def("attachToDevice", PTC(poptorch::bindings::attachToDevice));
  m.def("isAttachedToDevice", PTC(poptorch::bindings::isAttachedToDevice));
  m.def("registerCPUCallBack", PTC(poptorch::registerCPUCallBack));
  m.def("isAlreadyRegistered", PTC(poptorch::alreadyRegistered));
  m.def("registerBuffersWithCallback",
        PTC(poptorch::registerBuffersWithCallback));
  m.def("_validateOptions", PTC(poptorch::parseSessionOptionsVoid));

  py::enum_(m, "TracingMode")
      .value("PopART", poptorch::TracingMode::POPART)
      .export_values();

  // Dispatcher life-cycle and graph-construction entry points.
  m.def("poptorchAtExit", PTC(poptorch::poptorchAtExit));
  m.def("destroyDispatcher", PTC(poptorch::destroyDispatcher));
  m.def("startDispatch", PTC(poptorch::startDispatch));
  m.def("isCompilingWithDispatcher", PTC(poptorch::isCompilingWithDispatcher));
  m.def("endDispatch", PTC(poptorch::endDispatch));
  m.def("startParametersMove", PTC(poptorch::startParametersMove));
  m.def("endParametersMove", PTC(poptorch::endParametersMove));
  m.def("startOutputsMove", PTC(poptorch::startOutputsMove));
  m.def("endOutputsMove", PTC(poptorch::endOutputsMove));
  m.def("createGraph", PTC(poptorch::createGraph));
  m.def("mapParamsToNames", PTC(poptorch::mapParamsToNames));
  m.def("setPerReplica", PTC(poptorch::setPerReplica));
  m.def("finalizeGraph", PTC(poptorch::finalizeGraph));
  m.def("compileWithManualTracing",
        PTC(poptorch::bindings::compileWithManualTracing));
  m.def("processDispatchAndImportExecutable",
        PTC(poptorch::bindings::processDispatchAndImportExecutable));
  m.def("_throwTestError", PTC(poptorch::popart_compiler::throwTestError));
  m.def("getIpuTensorId", PTC(poptorch::getIpuTensorId));

  poptorch::bindings::initialiseExceptionHandling(m);

  py::enum_(m, "TestErrorType")
      .value("Poptorch", poptorch::popart_compiler::TestErrorType::Poptorch)
      .value("Popart", poptorch::popart_compiler::TestErrorType::Popart)
      .value("PopartInternal",
             poptorch::popart_compiler::TestErrorType::PopartInternal)
      .value("Poplibs", poptorch::popart_compiler::TestErrorType::Poplibs)
      .value("PoplarUnrecoverable",
             poptorch::popart_compiler::TestErrorType::PoplarUnrecoverable)
      .value("PoplarUnknown",
             poptorch::popart_compiler::TestErrorType::PoplarUnknown)
      .value(
          "PoplarRecoverableFullReset",
          poptorch::popart_compiler::TestErrorType::PoplarRecoverableFullReset)
      .value("PoplarLinkError",
             poptorch::popart_compiler::TestErrorType::PoplarLinkError);

  // Translate PoptorchError into a Python error: the exception is rethrown
  // and, when it is a PoptorchError, asked to set the Python error indicator.
  // Any other exception type propagates out of the translator unchanged.
  py::register_exception_translator(
      [](std::exception_ptr p) { // NOLINT: Don't change 'p' to a const&
        try {
          if (p) {
            std::rethrow_exception(p);
          }
        } catch (const poptorch::PoptorchError &e) {
          e.setErrorIndicator();
        }
      });
}
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import os
from typing import Dict

from ._logging import logger

# libpvti is only imported when the PVTI_OPTIONS environment variable is set;
# every profiling entry point below degrades to a no-op otherwise.
if os.environ.get("PVTI_OPTIONS") is None:
    _pvti_available = False
else:
    try:
        import libpvti as pvti  # type: ignore
        _pvti_available = True
    except ImportError as e:
        # Pass the exception as the lazy %-argument: the message contains a
        # "%s" placeholder that was previously never filled in.
        logger.info("Tracepoints disabled (Couldn't import libpvti: %s)", e)
        _pvti_available = False


class Channel:
    """Profiling channel.

    .. note:: If the ``libpvti`` profiling library is not available at
        runtime this class becomes a no-op.

    Example:

    >>> channel = poptorch.profiling.Channel("MyApp")
    >>> with channel.tracepoint("TimeThis"):
    ...     functionToTime()
    >>> channel.instrument(myobj, "methodName", "otherMethod")
    """

    def __init__(self, name):
        """Create (or reuse) the pvti channel registered under ``name``.

        :param name: Channel name, also used as prefix for the tracepoints
            created through :py:meth:`tracepoint`.
        """
        if _pvti_available:
            self._tracepoint_prefix = name
            self._channel = _Channels.getOrCreate(name)

    def instrument(self, obj, *methods):
        """Instrument the methods of an object.

        :param obj: Object to instrument
        :param methods: One or more methods to wrap in profiling trace points.
        :return: ``obj`` itself, so the call can be chained.
        """
        if _pvti_available:
            pvti.instrument(obj, methods, self._channel)
        return obj

    def tracepoint(self, name):
        """Create a context tracepoint

        >>> with channel.tracepoint("DoingSomething"):
        ...     expensiveCall()

        :param name: Name associated to this tracepoint.
        :return: A ``pvti.Tracepoint`` named ``<channel name>.<name>``, or a
            do-nothing context manager when pvti is not available.
        """
        if _pvti_available:
            tracepoint_name = self._tracepoint_prefix + "." + name
            return pvti.Tracepoint(self._channel, tracepoint_name)
        return _DummyTracepoint()


class _DummyTracepoint:
    """Dummy context used when pvti is not available"""

    def __enter__(self):
        pass

    def __exit__(self, exc_type, value, traceback):
        # Returns None, so exceptions raised inside the block propagate.
        pass


class _Channels:
    """Singleton library of registered Channels"""
    # Channel name -> pvti channel, shared by every Channel instance.
    _channels: Dict[str, 'pvti.Channel'] = {}

    @staticmethod
    def getOrCreate(name):
        """Return the pvti channel called ``name``, creating it on first use."""
        if name not in _Channels._channels:
            _Channels._channels[name] = pvti.createTraceChannel(name)
        return _Channels._channels[name]
def allclose(ref, other):
    """Return True if both the structure and the content of ref and other
    match.

    :param ref: Reference value: a ``torch.Tensor`` or an arbitrarily nested
        tuple / list of tensors.
    :param other: Value to compare against ``ref``.
    :return: ``True`` when ``other`` has the same container types and lengths
        as ``ref`` and every pair of tensors satisfies ``torch.allclose``.
    :raises TypeError: if ``ref`` (or one of its elements) is neither a
        tensor, a tuple nor a list.
    """
    if isinstance(ref, torch.Tensor):
        # A non-tensor counterpart is a structure mismatch, not an error.
        return isinstance(other, torch.Tensor) and torch.allclose(other, ref)
    if isinstance(ref, tuple):
        if not isinstance(other, tuple) or len(ref) != len(other):
            return False
    elif isinstance(ref, list):
        if not isinstance(other, list) or len(ref) != len(other):
            return False
    else:
        # The previous `assert "<message>"` could never fail (a non-empty
        # string is truthy), so unsupported types reached the loop below.
        raise TypeError("%s not supported" % type(ref))
    return all(allclose(r, o) for r, o in zip(ref, other))
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#
# Declares, through the popgen API, how aten operators are handled, then asks
# popgen to generate AtenHandlers.gen.cpp. The whole module namespace is
# passed to generate() via globals(), so the *_handler functions defined below
# are visible to it (presumably picked up by name - confirm in popgen).
import math
import os

from popgen.api import convert, expand, forward, generate, simplify
from popgen.helpers import alpha, cfloat, cint, clong, clong_list, \
                           cstr, dimension, empty_initializer, output_shape, \
                           output_type, reduction, tensor_list, tensor_long, \
                           tensor_shape, tensor_type
from popgen.operatorfactory import op
from utils import _utils

script = "PopAtenHandlers.py"
# Directory receiving the generated C++ file.
output_dir = os.path.join(_utils.sources_dir(),
                          "poptorch/source/popart_canonicalization")

# Constants passed to op.selu by the "selu" handler below.
selu_alpha = 1.6732632423543772848170429916717
selu_lambda = 1.0507009873554804934193349852946

# simplification rules
simplify("expm1", lambda x: op.exp(x) - 1.)
simplify("log1p", lambda x: op.log(1. + x))
simplify("reciprocal", lambda x: 1. / x)
simplify("div", lambda x, y: 1. / y * x)

# unary operators
opers = [
    "abs", "acos", "acosh", "asin", "asinh", "atan", "atanh", "ceil", "cos",
    "cosh", "detach", "erf", "exp", "expm1", "floor", "isnan", "log", "log1p",
    "logical_not", "neg", "reciprocal", "relu", "round", "sigmoid", "sin",
    "sinh", "sign", "sqrt", "tan", "tanh"
]

for oper in opers:
    convert(oper, 1)

# Same arity, but the target op has a different name.
convert("t", 1, "transpose")
convert("silu", 1, "swish")

expand("erfc", lambda x: 1. - op.erf(x))
expand("log2", lambda x: op.log(x) / math.log(2))
expand("log10", lambda x: op.log(x) / math.log(10))
expand("log_sigmoid", lambda x: op.log(op.sigmoid(x)))
forward("log_sigmoid_forward", "log_sigmoid")
# Note the order of the two constants: 1. is passed before 0.
expand(
    "rand", lambda x: op.randomUniform(x, output_shape(), cfloat(1.), cfloat(
        0.), output_type()))
expand(
    "randn", lambda: op.randomNormal(empty_initializer(), output_shape(),
                                     cfloat(0.), cfloat(1.), output_type()))
expand("rsqrt", lambda x: 1. / op.sqrt(x))
expand("selu", lambda x: op.selu(x, cfloat(selu_alpha), cfloat(selu_lambda)))
expand("square", lambda x: x * x)

# binary operators
# NOTE: `opers` is rebound here; the unary list above is no longer needed.
opers = ["atan2", "div", "fmod", "pow", "remainder"]

for oper in opers:
    convert(oper, 2)

convert("eq", 2, "equal")
convert("gt", 2, "greater")
convert("lt", 2, "less")
convert("logical_and", 2)
convert("logical_or", 2)

expand("cat", lambda x, y: op.concat(tensor_list(x), clong(y)))
expand("elu", lambda x, y, z: op.selu(x, cfloat(y), cfloat(z)))
expand("ge", lambda x, y: x >= y)
expand("le", lambda x, y: x <= y)
expand("leaky_relu", lambda x, y: op.leakyrelu(x, cfloat(y)))
expand("ne", lambda x, y: x != y)
expand("pixel_shuffle", lambda x, y: op.depthtospace(x, clong(y), cstr("CRD")))
expand("reflection_pad1d", lambda x, y: op.reflectionPad(x, clong_list(y)))
expand("replication_pad1d", lambda x, y: op.edgePad(x, clong_list(y)))
expand("rsub", lambda x, y: y - x)


# celu(x, a) = max(x, 0) + min(0, a * (exp(x / a) - 1))
def celu_handler(x, a):
    val = a * (op.exp(x / a) - 1.)
    return op.max(x, 0.) + op.min(0., val)


# Keep x where |x| > |l|, otherwise 0.
def hardshrink_handler(x, l):
    return op.where(op.abs(x) > op.abs(l), x, 0.)


# x - l where x > l, x + l where x < -l, otherwise 0.
def softshrink_handler(x, l):
    r = op.where(x > l, x - l, 0.)
    return op.where(x < -l, x + l, r)


# The 2d/3d padding variants reuse the 1d handlers.
forward("reflection_pad2d", "reflection_pad1d")
forward("replication_pad2d", "replication_pad1d")
forward("replication_pad3d", "replication_pad1d")

# ternary operators
# masked_fill maps onto "where" with its arguments reordered as [1, 2, 0].
convert("masked_fill", 3, "where", [1, 2, 0])
convert("where", 3)

expand("constant_pad_nd", lambda x, l, c: op.constantPad(
    x, clong_list(l), cfloat(c)))
# The two bounds are handed to op.clip in swapped order (b, then a).
expand("hardtanh", lambda x, a, b: op.clip(x, cfloat(b), cfloat(a)))
expand(
    "normal_", lambda x, c1, c2: op.randomNormal(x, tensor_shape(x), cfloat(
        c1), cfloat(c2)))
expand("sub", lambda x, y, a: op.sub(x, alpha(y, a)))
# As for hardtanh, b is passed before a.
expand(
    "uniform_", lambda x, a, b: op.randomUniform(x, tensor_shape(x), cfloat(b),
                                                 cfloat(a)))
expand(
    "topk", lambda x, c, l: op.topk(x, tensor_long(c),
                                    dimension(l, tensor_type(x))))
expand("threshold", lambda x, threshold, val: op.where(x > threshold, x, val))
expand("index_select",
       lambda x, d, i: op.gather(x, i, dimension(d, tensor_type(x))))


# loss handlers
# Each handler converts the integer `red` argument with reduction(clong(red))
# and finishes with op.identityloss.
def hinge_embedding_loss_handler(x, y, delta, red):
    red = reduction(clong(red))
    # x where y == 1, max(0, delta - x) where y == -1, otherwise 0.
    loss = op.where(y.equal(1.), x, 0.)
    loss = op.where(y.equal(-1.), op.max(0., delta - x), loss)
    return op.identityloss(loss, red)


def l1_loss_handler(x, y, red):
    red = reduction(clong(red))
    loss = op.l1loss(x - y, cfloat(1.), red)
    # The reduction is applied by l1loss above; identityloss receives the
    # constant cint(2) (presumably "no reduction" - confirm).
    return op.identityloss(loss, cint(2))


def margin_ranking_loss_handler(x1, x2, y, margin, red):
    red = reduction(clong(red))
    loss = op.max(-y * (x1 - x2) + margin, 0.)
    return op.identityloss(loss, red)


def mse_loss_handler(x, y, red):
    red = reduction(clong(red))
    loss = (x - y) * (x - y)
    return op.identityloss(loss, red)


def smooth_l1_loss_handler(x, y, red, beta):
    red = reduction(clong(red))
    delta = op.abs(x - y)
    # Quadratic below beta, linear above.
    loss = op.where(delta < beta, 0.5 * delta * delta / beta,
                    delta - 0.5 * beta)
    return op.identityloss(loss, red)


def soft_margin_loss_handler(x, y, red):
    red = reduction(clong(red))
    loss = op.log(1. + op.exp(-y * x))
    return op.identityloss(loss, red)


# everything else
# NOTE: alpha and beta are swapped with a gemm
expand(
    "addmm", lambda x, y, z, beta, alpha: op.gemm(y, z, x, cfloat(
        alpha), cfloat(beta), clong(0), clong(0)))

generate(script, "c10::aten", output_dir + "/AtenHandlers.gen.cpp", globals())
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#
# Script prelude: parses the command line, locates the Poplar / PopART
# headers and runs the popgen ONNX parser. Everything below executes at
# import time, in this order.
import enum
import argparse
import logging
import os
import re
import sys

import clang.cindex

from popgen import onnx
from utils import _utils

logger = logging.getLogger("PopParse")
_utils.set_logger(logger)

parser = argparse.ArgumentParser()
parser.add_argument("-c",
                    "--clang",
                    type=str,
                    help="Manually set path to clang headers")
parser.add_argument("-D",
                    "--debug",
                    action='store_true',
                    help="Enable debug printing")

args = parser.parse_args()

# Include directories; both are passed as -I flags to clang later on.
poplar_dir = onnx.find_poplar_includes()
popart_dir = onnx.find_popart_includes()

# onnx.init() must run before onnx.parse(): it receives the PopART include
# directory, the optional clang header path and the debug flag.
onnx.init(popart_dir, args.clang, args.debug)
# Parsed description of the available ops, indexed as
# jsonOutput[opset][op_name]["args"] by the generation code of this script.
jsonOutput = onnx.parse()

logging_level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(level=logging_level)
# List of options which cannot be resolved with clang, e.g. referring to values
# in external sources
options_not_resolved = ["defaultPrefetchBufferingDepth"]

# List of SessionOptions attributes PopTorch decided to not support
options_not_handled = [
    "bufferingDepthMap",
    "developerSettings",
    "prefetchBufferingDepthMap",
    "matmulOptions",
    "tensorLocationSettingsOverride",
    "autodiffSettings",
    "scheduleNonWeightUpdateGradientConsumersEarly",
    "matmulOptions",
    # Handled by PopTorch but not detected by this parser:
    "activationTensorLocationSettings",
    "replicatedCollectivesSettings",
    "automaticLossScalingSettings",
    "weightTensorLocationSettings",
    "optimizerStateTensorLocationSettings",
    "accumulatorTensorLocationSettings",
    "replicatedGraphCount",
    "accumulationReductionType",
    "executionPhaseSettings",
    "accumulateOuterFragmentSettings",
    "batchSerializationSettings",
    "automaticLossScalingSettings",
    "autodiffSettings",
    "_enableRngStateManagement",
    "createImplicitPipeliningFwdOnlyProgram",
]


class OptionType(enum.IntEnum):
    """Category of a SessionOptions attribute, as seen by PopTorch."""
    Bool = 0
    Int = 1
    Float = 2
    String = 3
    Container = 4
    Enum = 5
    Object = 6


# check container_options
def parse_session_options(root_node):  # pylint: disable=too-many-statements
    """Check PopTorch handles every attribute of PopART's SessionOptions.

    The options PopTorch handles are collected by matching the macro calls
    in ``popart_compiler/source/SessionOptions.cpp``; the options PopART
    declares are collected from the ``SessionOptions`` struct found under
    ``root_node``. Options listed in ``options_not_resolved`` or
    ``options_not_handled`` are ignored.

    :param root_node: clang cursor of the translation unit containing the
        ``SessionOptions`` struct declaration.
    :raises AssertionError: if an option is missing from PopTorch, if the
        types disagree, or if an attribute has a type this script doesn't
        know how to classify.
    """
    # Build the list of options handled by PopTorch:
    handled = {}
    checks = {
        r".*container_options, \"(.*)\",.*": OptionType.Container,
        r" *ADD_POPART_ENUM_OPTION\(([^,]+),.*": OptionType.Enum,
        r" *ADD_POPART_STRING_OPTION\((.*)\).*": OptionType.String,
        r" *ADD_POPART_UINT64_OPTION\((.*)\).*": OptionType.Int,
        r" *ADD_POPART_BOOL_OPTION\((.*)\).*": OptionType.Bool,
        r" *ADD_POPART_DOUBLE_OPTION\((.*)\).*": OptionType.Float
    }

    session_options_cpp = os.path.join(_utils.sources_dir(), "popart_compiler",
                                       "source", "SessionOptions.cpp")
    # Use a context manager so the file is closed once scanned.
    with open(session_options_cpp, "r") as source_file:
        for line in source_file:
            for expr, option_type in checks.items():
                m = re.match(expr, line)
                if m:
                    handled[m.group(1)] = option_type
                    break

    def find_session_options(node):
        """Depth-first search for the ``SessionOptions`` struct cursor."""
        if node.kind == clang.cindex.CursorKind.STRUCT_DECL and \
                node.spelling == "SessionOptions":
            return node
        for c in node.get_children():
            n = find_session_options(c)
            if n:
                return n
        return None

    def get_child(parent, child_type):
        """Return the only child of ``parent`` of kind ``child_type``.

        Returns ``None`` if there is no such child and asserts if there is
        more than one.
        """
        child = None
        for c in parent.get_children():
            if c.kind == child_type:
                assert child is None, (
                    f"More than one child of "
                    f"{parent.spelling} has type {str(child_type)}")
                child = c
        return child

    opts = find_session_options(root_node)
    expected = {}
    # Build the list of attributes in Popart's SessionOptions
    for c in opts.get_children():
        if c.kind != clang.cindex.CursorKind.FIELD_DECL:
            continue
        if c.spelling in options_not_resolved:
            continue
        # Only used to build the assertion message further down.
        children = list(c.get_children())

        # deal with CursorKind.UNEXPOSED_REF
        # this shows up when there is an implicit cast between the literal
        # initializer and the storage type of the structure member
        uc = get_child(c, clang.cindex.CursorKind.UNEXPOSED_EXPR) or c

        if (get_child(c, clang.cindex.CursorKind.CXX_BOOL_LITERAL_EXPR)
                or get_child(uc,
                             clang.cindex.CursorKind.CXX_BOOL_LITERAL_EXPR)):
            expected[c.spelling] = OptionType.Bool
        elif (get_child(c, clang.cindex.CursorKind.INTEGER_LITERAL)
              or get_child(uc, clang.cindex.CursorKind.INTEGER_LITERAL)):
            expected[c.spelling] = OptionType.Int
        elif (get_child(c, clang.cindex.CursorKind.FLOATING_LITERAL)
              or get_child(uc, clang.cindex.CursorKind.FLOATING_LITERAL)):
            expected[c.spelling] = OptionType.Float
        else:
            # No literal initializer: classify from the declared type.
            opt_type = get_child(c, clang.cindex.CursorKind.TEMPLATE_REF)
            if opt_type:
                if opt_type.spelling in ["set", "vector", "map"]:
                    expected[c.spelling] = OptionType.Container
                elif opt_type.spelling in ["function"]:
                    # Callbacks are not options: nothing to check.
                    continue
                else:
                    assert False, f"Template not supported {opt_type.spelling}"
            else:
                opt_type = get_child(c, clang.cindex.CursorKind.TYPE_REF)
                assert opt_type, (f"Can't find type of {c.spelling}: "
                                  f"{[str(d.kind) for d in children]}")
                if opt_type.spelling in ("std::string",
                                         "std::__cxx11::string"):
                    expected[c.spelling] = OptionType.String
                elif opt_type.spelling == \
                        "class popart::SessionOptions::NumIOTiles":
                    expected[c.spelling] = OptionType.Int
                elif opt_type.spelling.startswith("enum "):
                    expected[c.spelling] = OptionType.Enum
                elif opt_type.spelling.startswith("struct "):
                    expected[c.spelling] = OptionType.Object
                elif opt_type.spelling.startswith("class "):
                    expected[c.spelling] = OptionType.Object
                elif opt_type.spelling == "int64_t" or \
                        opt_type.spelling == "size_t":
                    expected[c.spelling] = OptionType.Int
                else:
                    assert False, f"Type not supported {opt_type.spelling}"

    # Collect every problem before failing so they are all reported at once.
    missing_mismatched = []
    for opt, option_type in expected.items():
        if opt in options_not_handled:
            continue
        if opt not in handled:
            missing_mismatched.append(
                f"Option {opt} not handled by PopTorch Type: "
                f"{str(option_type)}. You"
                " need to add the relevant macro in SessionOptions.cpp or to"
                " options_not_handled in this script.")
        elif handled[opt] != option_type:
            missing_mismatched.append(
                (f"Type mismatch for option {opt}: Popart type "
                 f"{str(option_type)} "
                 f"PopTorch: {str(handled[opt])}"))

    assert not missing_mismatched, "\n".join(missing_mismatched)


# Parse PopART's sessionoptions.hpp with clang and validate it right away.
index = clang.cindex.Index.create()
session_file = os.path.join(popart_dir, "popart", "sessionoptions.hpp")
tu = index.parse(session_file,
                 args=[
                     "-std=c++14",
                     "-I" + popart_dir,
                     "-I" + poplar_dir,
                     "-DONNX_NAMESPACE=onnx",
                 ])

parse_session_options(tu.cursor)
# `prelu' is supported, PyTorch's definition requires a reshape before passing
# to the ONNX op.
UnsupportedOps = ["abort", "ctcloss", "gru", "rnn", "tensorremap", "prelu"]

## Implicit cast support
# Casting on all args
# yapf: disable
CastingOps = [
    "add",
    "atan2",
    "bitshift",
    "call",
    "clip",
    "concat",
    "convtranspose",
    "div",
    "dynamicadd",
    "dynamiczero",
    "equal",
    "fmod",
    "gemm",
    "greater",
    "gru",
    "instancenormalization",
    "less",
    "logical_and",
    "logical_or",
    "logical_xor",
    "lstm",
    "matmul",
    "max",
    "maxroipool",
    "mean",
    "min",
    "mod",
    "mul",
    "pow",
    "range",
    "remainder",
    "rnn",
    "scan",
    "sequenceconstruct",
    "sub",
    "sum",
]
# yapf: enable

# Also Einsum, GreaterOrEqual, LessOrEqual

CastingExceptFirstArgsOps = ["where"]

CastingExceptSecondArgsOps = [
    "dequantizelinear", "scatterelements", "scatternd"
]
# Also Pad but only after >= 11

CastingExceptThirdArgsOps = ["roialign"]

CastingExceptFourthFifthArgsOps = []

# Implicit casting ops not in these catagories:
# QLinearConv, QLinearMatMul

# All implicitly casting ops produce an output the same as the promoted type
# except those which always return bools, floats (in onc case) and the following
CastingDifferentOutput = ["sequenceconstruct", "call"]

CastingAlwaysBoolOutput = [
    "equal", "greater", "less", "logical_and", "logical_not", "logical_or",
    "logical_xor"
]

CastingAlwaysFloatOutput = ["dequantizelinear"]

CastingAlwaysIntOutput = ["convinteger", "matmulinteger"]

## Non implicit-casting type support
# yapf: disable
OutputTypeSameAsFirstInput = [
    "_ctcloss",
    "abs",
    "acos",
    "acos",
    "acosh",
    "asin",
    "asinh",
    "atan",
    "atanh",
    "averagepool",
    "batchnormalization",
    "bitwiseand",
    "bitwiseor",
    "bitwisexor",
    "bitwisexnor",
    "bitwisenot",
    "bitwisexor",
    "ceil",
    "celu",
    "compress",
    "concat",
    "conv",
    "cos",
    "cosh",
    "cumsum",
    "depthtospace",
    "det",
    "detach",
    "dropout",
    "dynamicupdate",
    "dynamicslice",
    "einsum",
    "elu",
    "erf",
    "exp",
    "expand",
    "expm1",
    "flatten",
    "floor",
    "fmod",
    "gather",
    "gatherelements",
    "gathernd",
    "gelu",
    "geluerf",
    "globalaveragepool",
    "globallppool",
    "globalmaxpool",
    "groupnormalization",
    "hardmax",
    "hardsigmoid",
    "hardswish",
    "identity",
    "identityloss",
    "l1loss",
    "leakyrelu",
    "log",
    "log1p",
    "logical_not",
    "logsoftmax",
    "lpnormalization",
    "lppool",
    "lrn",
    "maxpool",
    "maxunpool",
    "meanvariancenormalization",
    "neg",
    "nllloss",
    "nop",
    "onehot",
    "pad",
    "printtensor",
    "range",
    "reciprocal",
    "reducel1",
    "reducel2",
    "reducelogsum",
    "reducelogsumexp",
    "reducemax",
    "reducemean",
    "reducemin",
    "reduceprod",
    "reducesum",
    "reducesumsquare",
    "relu",
    "remainder",
    "replicatedallreduce",
    "reshape",
    "resize",
    "reverse",
    "reversesequence",
    "roialign",
    "round",
    "scale",
    "scaledadd",
    "scatter",
    "scatterreduce",
    "selu",
    "sequenceerase",
    "shapeddropout",
    "shrink",
    "sigmoid",
    "sign",
    "sin",
    "sinh",
    "slice",
    "softmax",
    "softplus",
    "softsign",
    "spacetodepth",
    "split",
    "sqrt",
    "squeeze",
    "stringnormalizer",
    "subsample",
    "swish",
    "tan",
    "tanh",
    "thresholdedrelu",
    "tile",
    "transpose",
    "unique",
    "unsqueeze",
    "upsample",
]
# yapf: enable

FirstOutputTypeSameAsFirstInputButSecondAlwaysInt = ["topk", "reducemedian"]

OutputTypeSameAsThirdInput = ["onehot"]

OutputTypeAlwaysBool = [
    "isinf", "isnan", "logical_and", "logical_not", "logical_or", "logical_xor"
]

OutputTypeAlwaysFloat = ["tfidfvectorizer"]

OutputTypeAlwaysInt32 = [
    "argmax", "argmin", "isinf", "isnan", "nonmaxsuppression", "nonzero",
    "shape", "size"
]

OutputTypeAlwaysUint8 = [
    "dynamicquantizelinear", "quantizelinear", "qlinearconv", "qlinearmatmul"
]

OutputTypeAsDtype = [
    "cast", "eyelike", "multinomial", "randomnormal", "randomuniform"
]

OutputTypeAsDtypeOrAsPromoted = ["randomnormallike", "randomuniformlike"]

# Ops whose output type is deliberately left unknown. Two entries were
# misspelt ("sequentempty", and "sequenceinsert " with a trailing space) and
# could therefore never match an op name.
OutputTypeVariable = [
    "concatfromsequence", "constant", "constantofshape", "loop", "multinomial",
    "sequenceat", "sequenceempty", "sequenceinsert", "splittosequence"
]

# Number of outputs for the ops which don't return exactly one tensor: either
# a literal or the name of the C++ argument holding the count.
MultipleOutputsOps = {
    "gru": "2",
    "lstm": "2",
    "rnn": "2",
    "split": "num_outputs",
    "topk": "2",
    "reducemedian": "2",
    "batchnormalization": "num_node_outputs",
}

# Extra C++ parameters appended to the generated create<Op>() signature.
ExtraArgumentOps = {
    "batchnormalization": ["unsigned int num_node_outputs"],
}

# NOTE(review): several keys below are repeated ("nonstd::optional",
# "std::vector"); they look like distinct template instantiations whose
# template arguments were lost. With duplicate keys only the last entry
# survives - confirm against the upstream file.
CXXTypeToTypeClass = {
    # Scalar integers
    "int64_t": "INT",
    "int": "INT",
    "bool": "INT",
    "unsigned int": "INT",
    "popart::ReductionType": "INT",
    "popart::ScatterReduction": "INT",
    "nonstd::optional": "INT",
    "nonstd::optional": "INT",
    "Attributes::Int": "INT",

    # Floats
    "float": "FLOAT",
    "nonstd::optional": "FLOAT",

    # Non-scalar floats
    "std::vector": "FLOAT_VEC",

    # Non-scalar integers.
    "std::vector": "INT_VEC",
    "nonstd::optional >": "INT_VEC",
    "Attributes::Ints": "INT_VEC",

    # String
    "char": "CHAR",
    "std::string": "STRING",
    "std::vector": "STRING_VEC",

    # Debug context
    "popart::DebugContext": "DEBUG_CONTEXT"
}


# Cleans up raw C++ type to remove reference or const qualifiers
def clean(cxxType):
    """Return ``cxxType`` without ``&``, without ``const`` and trimmed.

    Every occurrence of the substring ``const`` is removed, wherever it
    appears in the type.
    """
    return cxxType.replace("&", "").replace("const", "").strip()


# Convert the raw C++ type parsed from the header into the macro type.
def toType(cxxType):
    """Map a raw C++ type onto its macro type class.

    :param cxxType: Type as spelled in the PopART header.
    :return: The matching value of ``CXXTypeToTypeClass`` or ``"UNKNOWN"``
        (logged at debug level) when the cleaned type is not in the table.
    """
    cleaned = clean(cxxType)
    if cleaned in CXXTypeToTypeClass:
        return CXXTypeToTypeClass[cleaned]

    logger.debug("toType: Unknown cxxType=%s / cleaned=%s", cxxType, cleaned)

    # Soft fail as it isn't unexpected for some popart functions to be unsupported right now.
    return "UNKNOWN"


# NOTE(review): as above, repeated keys / bare "std::vector" values suggest
# lost template arguments - confirm against the upstream file.
CXX_TYPE_CONV_TABLE = {
    "nonstd::optional": "std::int32_t",
    "nonstd::optional": "std::int32_t",
    "popart::ReductionType": "std::int32_t",
    "popart::ScatterReduction": "std::int32_t",
    "nonstd::optional": "float",
    "nonstd::optional>": "std::vector",
    "Attributes::Ints": "std::vector",
    "Attributes::Int": "std::int32_t",
    "std::vector": "std::vector"
}

CXX_NON_CONV_TYPES = [
    "char", "bool", "float", "int", "int64_t", "unsigned int", "std::string",
    "std::vector", "std::vector", "popart::DebugContext"
]
def convertCxxConvert(cxxType_orig):
    """Translate a PopART header type into the type used in generated code.

    The type is normalised (``&`` dropped, whitespace removed except after
    ``const`` / ``unsigned``) and looked up first as is, then without a
    leading ``const``. Types found in ``CXX_TYPE_CONV_TABLE`` are replaced by
    their conversion, types found in ``CXX_NON_CONV_TYPES`` are returned
    exactly as received. Anything else is reported on stdout and terminates
    the script with exit status 1.
    """
    normalised = cxxType_orig.replace("&", "")
    normalised = normalised.replace("const ", "const[preserved_space]")
    normalised = normalised.replace("unsigned const", "const unsigned")

    # Protect the space following "const" / "unsigned", squash every other
    # whitespace character, then restore the protected spaces.
    normalised = normalised.replace("const ", "const[preserved_space]")
    normalised = normalised.replace("unsigned ", "unsigned[preserved_space]")
    normalised = "".join(normalised.split())
    normalised = normalised.replace("[preserved_space]", " ")

    # Look the type up as is first, then with its leading const dropped
    # (const is dropped from converted types for legacy reasons).
    candidates = [normalised]
    const_prefix = "const "
    if normalised.startswith(const_prefix):
        candidates.append(normalised[len(const_prefix):])

    for candidate in candidates:
        if candidate in CXX_TYPE_CONV_TABLE:
            return CXX_TYPE_CONV_TABLE[candidate]
        # Most types won't need processing
        if candidate in CXX_NON_CONV_TYPES:
            return cxxType_orig

    # Error on unknown types
    print(f"Unknown type: {normalised}")
    sys.exit(1)


def attrTypeGetter(ty):
    """Return the attribute-kind suffix used for macro type ``ty``.

    Asserts that ``ty`` is one of the known macro types.
    """
    suffix_by_type = dict(
        CHAR="i",
        INT="i",
        INT_VEC="is",
        FLOAT="f",
        FLOAT_VEC="fs",
        STRING="s",
        STRING_VEC="ss",
        DEBUG_CONTEXT="x",
    )

    assert ty in suffix_by_type, "Invalid type: " + ty
    return suffix_by_type[ty]


def addCastingOptStr(name):
    """Return the ``ImplicitCast::*`` spelling that applies to op ``name``."""
    # Checked in order: the first list containing the op wins.
    casting_rules = (
        (CastingOps, "ImplicitCast::All"),
        (CastingExceptFirstArgsOps, "ImplicitCast::ExceptFirst"),
        (CastingExceptSecondArgsOps, "ImplicitCast::ExceptSecond"),
        (CastingExceptThirdArgsOps, "ImplicitCast::ExceptThird"),
        (CastingExceptFourthFifthArgsOps, "ImplicitCast::ExceptFourthFifth"),
    )
    for op_names, spelling in casting_rules:
        if name in op_names:
            return spelling
    return "ImplicitCast::None"


def addOutputTypeStr(name):  # pylint: disable=too-many-return-statements
    """Return the ``OutputType::*`` spelling that applies to op ``name``.

    Ops which are not classified anywhere are reported on stdout and get
    ``OutputType::Unknown``.
    """
    # Each rule is (lists to search, resulting spelling); rules are evaluated
    # in order and the first match wins.
    output_rules = (
        ((CastingAlwaysBoolOutput, OutputTypeAlwaysBool),
         "OutputType::AlwaysBool"),
        ((CastingAlwaysFloatOutput, OutputTypeAlwaysFloat),
         "OutputType::AlwaysFloat"),
        ((CastingAlwaysIntOutput, OutputTypeAlwaysInt32),
         "OutputType::AlwaysInt"),
        ((CastingOps, CastingExceptFirstArgsOps, CastingExceptSecondArgsOps,
          CastingExceptThirdArgsOps), "OutputType::AsImplicitCastPromoted"),
        ((OutputTypeSameAsFirstInput, ), "OutputType::AsFirstInput"),
        ((FirstOutputTypeSameAsFirstInputButSecondAlwaysInt, ),
         "OutputType::FirstAsFirstInputSecondAlwaysInt"),
        ((OutputTypeSameAsThirdInput, ), "OutputType::AsThirdInput"),
        ((OutputTypeAlwaysUint8, ), "OutputType::AlwaysUint8"),
        ((OutputTypeAsDtype, ), "OutputType::AsDtype"),
        ((OutputTypeAsDtypeOrAsPromoted, ), "OutputType::AsDtypeOrAsPromoted"),
        ((OutputTypeVariable, ), "OutputType::Unknown"),
    )
    for op_lists, spelling in output_rules:
        for op_names in op_lists:
            if name in op_names:
                return spelling

    print(f"Missing type spec for: {name}")
    return "OutputType::Unknown"
# Code generation: for every supported op found in jsonOutput, build
#  - one OP_DECL(...) line (macroFile),
#  - one create<Op>() declaration (headerStubs),
#  - one create<Op>() definition (cxxFile),
# then write the three accumulated texts to their respective files.
macroFile = ""

headerStubs = ""

cxxFile = ""

# The op sets are processed in the reverse of jsonOutput's iteration order.
classes = []
for classname in jsonOutput:
    classes.append(classname)

classes.reverse()

for opset in classes:
    macroFile += "// Ops from %s\n" % opset
    for name in jsonOutput[opset]:
        if name in UnsupportedOps:
            continue

        logger.debug("Generating code for %s::%s", opset, name)
        # Generate the macro
        opDecl = "OP_DECL("

        funcName = name.capitalize()
        opDecl += "popart, " + name + ", " + name

        # Every AiOnnxOpset* op set is emitted against AiOnnxOpset11.
        if opset.startswith("AiOnnxOpset"):
            opDecl += ", AiOnnxOpset11." + name
        else:
            opDecl += ", " + opset + "." + name

        argVector = ""
        bodyArgVector = ""

        # Stays True (i.e. the op is skipped) unless an "args" argument is
        # found below.
        earlyExit = True
        # NOTE: this rebinds the module-level name `args`.
        args = jsonOutput[opset][name]["args"]
        for arg in args:
            # Skip the first args and also the "name" arg.
            if arg["name"] == "args":
                # Guarantee we are working with an op which takes in popart tensors as 0th argument.
                earlyExit = False
                continue

            macroType = toType(arg["type"])

            if macroType == "UNKNOWN":
                logger.info("Skipping OP: %s"
                            " due to parse failure on %s", name, str(arg))
                earlyExit = True
                break

            # debugContext is not exposed as an ARG(): it is only injected in
            # the body arguments below.
            if arg["name"] != "debugContext":
                argVector += "ARG(" + macroType + "," + arg["name"] + ") "

            # Arguments whose type ends in ReductionType / ScatterReduction
            # are wrapped in a static_cast to their cleaned C++ type.
            if any(arg["type"].endswith(s)
                   for s in ["ReductionType", "ScatterReduction"]):
                bodyArgVector += f"BODY_ARG(static_cast<{clean(arg['type'])}>("\
                    + arg["name"] + ")) "
            elif arg["name"] == "debugContext":
                bodyArgVector += "BODY_ARG(DEBUG_CONTEXT(\"" + funcName + "\"))"
            else:
                bodyArgVector += "BODY_ARG(" + arg["name"] + ") "

        if earlyExit:
            continue

        if argVector == "":
            argVector = "NONE"

        if bodyArgVector == "":
            bodyArgVector = "NONE"

        opDecl += ", " + argVector
        opDecl += ", " + bodyArgVector

        macroFile += opDecl + ")\n"

        # `header` accumulates the function signature, `cppFile` its body;
        # both are completed once the arguments have been appended.
        header = "torch::jit::Node* "

        header += "create" + funcName + "(torch::jit::Graph *graph,  const " \
            "std::vector& args"

        cppFile = " torch::jit::Node *new_node = createAndInsertNode(graph, " \
            "symbols::popart::" + name + ", args"

        cppFile += f", {addCastingOptStr(name)}, {addOutputTypeStr(name)}"

        if name in MultipleOutputsOps:
            cppFile += ", %s" % MultipleOutputsOps[name]
        cppFile += ");\n"

        args = jsonOutput[opset][name]["args"]
        for arg in args:
            # Skip the first args
            if arg["name"] == "args":
                continue

            attr = attrTypeGetter(toType(arg["type"]))

            # "x" is the kind returned for DEBUG_CONTEXT: such arguments are
            # neither part of the signature nor stored as attributes.
            if attr == "x":
                continue

            header += "," + convertCxxConvert(arg["type"]) + " " + arg["name"]

            cppFile += "new_node->" + attr + "_(c10::Symbol::attr("\
                "\"" + arg["name"] + "\")," + arg["name"] + ");\n"

        if name in ExtraArgumentOps:
            header += ", " + ", ".join(ExtraArgumentOps[name])

        # For these ops an explicit setNodeOutputsTypes() call is emitted
        # after the attributes have been set.
        if name in OutputTypeAsDtype:
            cppFile += "setNodeOutputsTypes(new_node, ImplicitCast::None, "
            cppFile += "OutputType::AsDtype);\n"

        if name in OutputTypeAsDtypeOrAsPromoted:
            cppFile += "setNodeOutputsTypes(new_node, ImplicitCast::All, "
            cppFile += "OutputType::AsDtypeOrAsPromoted);\n"

        cppFile += "return new_node;\n"

        # Definition = signature + body; the declaration only gets the
        # closing ");" afterwards.
        cppFile = header + ") {\n" + cppFile + "}"

        header += ");"

        headerStubs += header + "\n"
        cxxFile += cppFile + "\n"

# Banner written at the top of each generated file.
autoComment = """// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
// Auto generated file, do not modify
// Run `python3 scripts/PopParse.py` to regenerate
// clang-format off
"""

with open(
        os.path.join(_utils.sources_dir(), 'popart_compiler', 'include',
                     'popart_compiler', 'CompilerOperationMacros.inc.hpp'),
        'w') as f:
    print(autoComment, file=f)
    print(macroFile, file=f)

with open(
        os.path.join(_utils.sources_dir(), 'poptorch', 'source', 'include',
                     'poptorch', 'CompilerOps.inc.hpp'), 'w') as f:
    print(autoComment, file=f)
    print(headerStubs, file=f)

with open(
        os.path.join(_utils.sources_dir(), 'poptorch', 'source',
                     'CompilerOps.cpp.inc'), 'w') as f:
    print(autoComment, file=f)
    print(cxxFile, file=f)
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
#
# Declares, through the popgen API, how the poptorch-specific operators are
# handled, then asks popgen to generate PoptorchHandlers.gen.cpp for the
# "symbols::poptorch" namespace.
import os

from popgen.api import expand, convert, generate
from popgen.helpers import cint, clong, cstr, tensor_list
from popgen.values import OriginalNode
from popgen.operatorfactory import op
from utils import _utils

script = "PopTorchHandlers.py"
# Directory receiving the generated C++ file.
output_dir = os.path.join(_utils.sources_dir(),
                          "poptorch/source/popart_canonicalization")

# Straight conversions: (source op, number of arguments, target op).
convert("recomputation_checkpoint", 1, "recomputationCheckpoint")
convert("update_param_inplace", 2, "copyvarupdate")

# Expansions: the lambdas wrap each argument in the popgen helper (clong,
# cint, cstr, tensor_list) matching how it is passed to the target op.
expand("begin_ipu_block", lambda x, y, z: op.beginIpuBlock(
    clong(x), clong(y), clong(z)))
expand("internal_cast", lambda tensor, dtype: op.internalCast(
    tensor, cstr(dtype)))
# OriginalNode() is passed in addition to the op's own two arguments.
expand("call_cpu_op", lambda x, s: op.callCpuOp(tensor_list(x), cstr(s),
                                                OriginalNode()))
expand("identity_loss", lambda x, r: op.identityloss(x, cint(r)))
expand("optimizer_group", lambda x, l: op.optimizerGroup(
    clong(x), tensor_list(l)))
expand(
    "set_matmul_serialization", lambda x, s, a, b: op.setMatMulSerialization(
        x, cstr(s), clong(a), cint(b)))
expand("start_for_loop", op.startForLoop)
expand(
    "end_for_loop", lambda output, inputs, trip_count: op.endForLoop(
        output, inputs, clong(trip_count)))
expand("start_if_block", op.startIfBlock)
expand("start_else_block", op.startElseBlock)
expand("end_if_block", op.endIfBlock)
expand("nop", op.nop)

# These are graph annotations: they don't take any arguments and don't return
# anything: we just want to pass them through to the lowering stage.
expand("end_ipu_block", op.passThrough)
expand("begin_multi_conv", op.passThrough)
expand("pop_name_scope", op.passThrough)
expand("end_cpu_op", op.passThrough)

generate(script, "symbols::poptorch", output_dir + "/PoptorchHandlers.gen.cpp",
         globals())
class OutputProcessor:
    """Interface of the callables used to post-process a command's output.

    An implementation is called with the raw output and the return code and
    returns the ``(output, returncode)`` pair to use in their place.
    """

    def __call__(self, raw_output: str, returncode: int) -> Tuple[str, int]:
        raise NotImplementedError()


class SaveOutput(OutputProcessor):
    """Processor which keeps a copy of the raw output in ``self.output``."""

    def __init__(self):
        self.output = ""

    def __call__(self, raw_output: str, returncode: int) -> Tuple[str, int]:
        # Both values are handed back untouched: only the copy is kept.
        unchanged = (raw_output, returncode)
        self.output = raw_output
        return unchanged


class GitStrategy(enum.Enum):
    """Ways of selecting the files to lint from the state of the git tree."""
    # Files modified / added between HEAD and origin/master
    Master = "master"
    # Files modified / added in the last commit
    Head = "head"
    # Files modified / added but not committed
    Diff = "diff"
    # All files tracked by git
    All = "all"
    # pre-commit is like "master" except it takes precedence over the files
    # provided on the command line.
    PreCommit = "pre-commit"
class ILinterFamily:
    """Regroup the linters running on the same types of files (e.g cpp or py)
    """

    def __init__(self,
                 supported_extensions,
                 linters,
                 excluded_extensions=None):
        """
        :param supported_extensions: Array of extensions supported by the
               linters (e.g ["hpp","cpp"])
        :param linters: list of linters to run for the matching files
        :param excluded_extensions: Optional list of extensions to exclude
        """
        self._linters = linters
        # ["hpp","cpp"] -> ".*\.(hpp|cpp)$"
        self._supported = self._compile_extensions(supported_extensions)
        if excluded_extensions:
            self._excluded = self._compile_extensions(excluded_extensions)
        else:
            self._excluded = None
        # True until the linters' installation has been checked once.
        self.first_lint = True

    @staticmethod
    def _compile_extensions(extensions):
        """Build the regex matching file names ending in one of extensions.

        Each extension is escaped so that it is matched literally: without
        this the "." of an extension such as "inc.hpp" would match any
        character.
        """
        alternatives = "|".join(re.escape(ext) for ext in extensions)
        return re.compile(r".*\.(%s)$" % alternatives)

    def gen_lint_commands(self, filename, autofix):
        """Return the lint commands of this family for the given file.

        :param filename: path of the file to lint
        :param autofix: forwarded to the linters
        :returns: an empty list if the extension is not supported or is
                  excluded, otherwise one entry per enabled linter.

        The first time a file is accepted, every linter's check_version() is
        called and the process exits with status 1 if any of them fails.
        """
        if not self._supported.match(filename):
            logger.debug("%s didn't match %s", filename, self._supported)
            return []
        if self._excluded and self._excluded.match(filename):
            logger.debug("%s matched exclusion %s", filename, self._excluded)
            return []
        if self.first_lint:
            # Check all the linters are correctly installed
            self.first_lint = False
            all_valid = all(linter.check_version()
                            for linter in self._linters)
            if not all_valid:
                print("\nERROR: You need a valid PopTorch buildenv to run "
                      "the linters:")
                print("- create a buildenv using scripts/create_buildenv.py")
                print(
                    "- activate the environment: source activate_buildenv.sh")
                print("- configure your PopTorch build: cmake "
                      "../poptorch -DPOPLAR_SDK=...")
                sys.exit(1)
        return [
            linter.gen_lint_command(filename, autofix)
            for linter in self._linters
            if linter.is_enabled(filename, autofix)
        ]
class Command:
    """Run a shell command through the ProcessManager and collect its output.

    The pieces passed as positional arguments are joined with spaces to form
    the command line. The output is accumulated line by line and, once the
    process has returned, optionally rewritten by ``output_processor`` before
    being printed.
    """

    def __init__(self,
                 *cmd,
                 stop_on_error=True,
                 print_output=True,
                 output_processor=None,
                 name=None,
                 print_output_on_error=True):
        # "set -e" makes the sub shell stop on the first failing command
        prefix = "set -e;" if stop_on_error else ""
        self.cmd = prefix + " ".join(cmd)
        self.output_processor = output_processor
        self.print_output = print_output
        self.proc = None
        self.output = ""
        # Default to the first piece of the command as display name
        self.name = name or cmd[0]
        self.print_output_on_error = print_output_on_error

    def start(self):
        """Hand the process creation over to the ProcessManager."""
        ProcessManager.get().enqueue(self._create_proc)

    def _create_proc(self):
        """Create the underlying process and return it."""
        assert self.proc is None, "Process already started"
        self.output = ""

        def append_to_output(line):
            self.output += line + "\n"

        # We make sure that the PYTHONPATH is clear because we do not want the
        # linter to undertake run-time inspection of the poptorch module.
        new_env = os.environ.copy()
        new_env["PYTHONPATH"] = ""
        new_env.pop("CPATH", None)

        self.proc = _utils.Process([self.cmd],
                                   redirect_stderr=True,
                                   env=new_env,
                                   stdout_handler=append_to_output)
        return self.proc

    def is_running(self):
        """A command which hasn't been created yet counts as running."""
        if self.proc is None:
            return True
        return self.proc.is_running()

    def wait(self):
        """Block until the command has completed and return its return code.

        The return code is the one produced by ``output_processor`` when one
        was provided.
        """
        # The process might still be queued: keep polling the manager until
        # it has been created.
        while self.proc is None:
            ProcessManager.get().update()
            time.sleep(1)

        returncode = self.proc.wait()
        output = self.output
        logger.debug("Command %s returned with %d", self.name, returncode)

        processor = self.output_processor
        if processor:
            output, returncode = processor(output, returncode)

        if self.print_output_on_error and returncode:
            print(f"{self.name} failed with exit code {returncode}")
            print("Output:")
            print(output)
        elif self.print_output and output:
            print(f"Output of {self.name}:")
            print(output)
        return returncode

    def run(self):
        """Start the command and wait for it to complete."""
        self.start()
        return self.wait()
def offset_to_line(filename, offsets):
    """Convert a list of offsets in a file to a dictionary of line, column.

    [ offset ] -> { offset: (line,column) }

    Lines and columns are 1-based. Offsets and columns are counted in bytes:
    the file is read in binary mode so that multi-byte characters and "\\r\\n"
    line endings do not shift the positions of byte offsets.

    :param filename: file the offsets refer to. If empty, every offset is
                     mapped to (0, 0).
    :param offsets: iterable of offsets; duplicates are allowed.
    :raises RuntimeError: if an offset lies beyond the end of the file.
    """
    if not filename:
        return {offset: (0, 0) for offset in offsets}

    mappings = {}
    pending = iter(sorted(set(offsets)))
    offset = next(pending, None)
    if offset is None:
        # Nothing to convert: don't even open the file.
        return mappings

    file_offset = 0
    # The context manager guarantees the file is closed on every exit path
    # (early return or exception).
    with open(filename, "rb") as src:
        for line, content in enumerate(src, start=1):
            start_line_offset = file_offset
            file_offset += len(content)
            # The offsets are sorted: consume all those falling on this line
            while offset < file_offset:
                mappings[offset] = (line, offset - start_line_offset + 1)
                offset = next(pending, None)
                if offset is None:
                    return mappings

    raise RuntimeError(f"Invalid offset {offset} (File length: {file_offset})")
""" def __init__(self, filename, linter, autofix): self.filename = filename self.linter = linter if autofix: self.linter += "(autofix)" self.autofix = autofix def __call__(self, output, errcode): """Called by Command with the output of the linter""" origin = open(self.filename).readlines() new = output.splitlines(True) delta = "" for line in difflib.unified_diff(origin, new, fromfile="a/" + self.filename, tofile="b/" + self.filename): m = re.match(r"@@ -(\d+),.*@@", line) if m: print(f"{self.filename}:{int(m.group(1))+3}:error:" f"[{self.linter}] to fix run " "./scripts/apply_linters.py --autofix") delta += line if delta: if self.autofix: with open(self.filename, "w") as f: f.write(output) else: print(f"{self.linter} found the following issues in " f"{self.filename}\n{delta}") errcode = 1 return delta, errcode class VersionParseCommandBase(CondaCommand): def __init__(self, *cmd, **kwargs): super().__init__(*cmd, **kwargs) self.version = None def _parse_version(self, output, return_code): raise NotImplementedError("Must be implemented in the derived type") def run_and_compare_versions(self, expected): self.run() expected_version = packaging.version.parse(expected) if expected_version != self.version: logger.error("Required version of %s is %s, but found %s", self.name, expected_version, self.version) return False return True class VersionJSONParseCommand(VersionParseCommandBase): def __init__(self, command_name): super().__init__( f"grep \\\"version\\\" " f"${{CONDA_PREFIX}}/conda-meta/{command_name}-*.json", print_output=False, output_processor=self._parse_version) def _parse_version(self, output, return_code): if return_code: return output, return_code self.version = packaging.version.parse( json.loads("{" + output + "}")["version"]) return output, return_code class VersionParseCommand(VersionParseCommandBase): def __init__(self, version_re_prefix, command_name): super().__init__(command_name, "--version", print_output=False, output_processor=self._parse_version) 
class ClangFormat(ILinter):
    """Linter wrapping clang-format."""

    def gen_lint_command(self, filename, autofix):
        """Create the clang-format command for the given file.

        With autofix the file is modified in place ("-i"), otherwise the
        output of clang-format is handed to a DiffCreator.
        """
        if autofix:
            flags, output_processor = " -i", None
        else:
            flags = ""
            output_processor = DiffCreator(filename, "clang-format", autofix)
        binary = ClangTools.clang_format()
        return CondaCommand(binary,
                            flags,
                            filename,
                            output_processor=output_processor,
                            print_output=autofix)

    def check_version(self):
        """Check the version reported by clang-format is 13.0.1."""
        binary = ClangTools.clang_format()
        return compare_versions_from_output(binary, "13.0.1", "version")
# Error messages are linked to a file + offset # Collect the "offsets" used for each filename offsets = collections.defaultdict(list) for diag in diagnostics: msg = diag["DiagnosticMessage"] offsets[msg["FilePath"]].append(msg["FileOffset"]) # Create a map of map that linking offsets in files to their # corresponding line and column: # line_mapping[filename] = { offset : (line, col) } line_mappings = {} for filename, file_offsets in offsets.items(): # Don't lint files in the build folder if not os.path.isabs(filename): continue line_mappings[filename] = offset_to_line( filename, file_offsets) printed = [] for diag in diagnostics: msg = diag["DiagnosticMessage"] filename = msg["FilePath"] # Don't lint files in the build folder if not os.path.isabs(filename): continue line, col = line_mappings[filename][msg["FileOffset"]] error = "error" if self.autofix and msg["Replacements"]: error += " (autofixed)" output = f"{filename}:{line}:{col}: {error}: " output += f"{msg['Message']} [{diag['DiagnosticName']}]" # If this message has already been printed: skip it if output in printed: continue if not printed: print("Output of clang-tidy:") print(output) printed.append(output) if not printed and returncode != 0: # If we didn't manage to parse the diagnostics but clang-tidy # returned a failure at least print the raw output. 
def get_compile_commands_flags(self, filename):
    """Look up the compilation settings to use for the given file.

    :param filename: path of the file about to be linted
    :returns: the entry of self.compile_commands registered for the file
              itself (cpp files only) or, failing that, for the first
              registered path starting with the folder expected to hold the
              matching sources. ("", "") if nothing is found.
    """
    if filename.endswith("cpp"):
        if filename in self.compile_commands:
            return self.compile_commands[filename]
        logger.warning(
            "%s is absent from compile_commands.json: check "
            "CMakeLists.txt to make sure it's compiled", filename)
        # Fall through to header path to try to find
        # flags for files in the same folder
    folder = os.path.dirname(filename)
    filename = os.path.basename(filename)
    path = folder.split(os.path.sep)
    # If it's a public header then it will be in
    # poptorch/component/include/component/my_header.hpp
    # and the cpp files will be in /component/source/
    #
    # Therefore we need to replace "include/component" with "source"
    # to find a cpp file with the compilation flags we want.
    if "include" in path:
        # Remove folders in path up to "include"
        while path.pop() != "include":
            continue
        # Types is a sub module in popart_compiler, so we want to go up one more level.
        # NOTE(review): if "include" was the first component, path is now
        # empty and path[-1] raises IndexError.
        if path[-1] == "types":
            path.pop()
        # TODO(T49191) lower_to_poplar, dialect and pytorch_bridge don't
        # have their sources in a "source" subfolder at the moment.
        exceptions = ["lower_to_poplar", "pytorch_bridge", "dialect"]
        if not "source" in path and not any(comp in path
                                            for comp in exceptions):
            # Point at "source" instead
            path.append("source")
    # else it's a private header: nothing to do, it's already in the same
    # folder as the source files.
    # NOTE(review): os.path.join(*path) raises TypeError if path is empty.
    folder = os.path.join(*path)
    # From here on "path" is rebound to the keys of compile_commands: the
    # list of folder components built above is no longer needed.
    for path, flags in self.compile_commands.items():
        if path.startswith(folder):
            logger.debug("Found flags for folder %s", folder)
            return flags
    logger.warning("No compilation flags found for folder %s", folder)
    return ("", "")
# pylint: disable=too-many-return-statements
def check_version(self):
    """Probe the environment and initialise the state needed to run clang-tidy.

    Runs, in order: the search for g++'s include folders, the dump of
    clang-tidy's configuration, the listing of its checks, a test that every
    include folder exists and the read of compile_commands.json. This fills
    self.includes, self.configs and (through process_compile_commands)
    self.compile_commands.

    :returns: False as soon as one of the commands reports a failure,
              otherwise the result of comparing the version printed by
              clang-tidy with "13.0.1".
    """
    # Lines of the dumped configuration, shared by the two parsers below.
    config = []
    self.configs = []

    def parse_config(output, returncode):
        # Keep the dumped configuration, line by line (line endings kept).
        nonlocal config
        config = output.splitlines(True)
        # For some reason clang-tidy's config contains these options it doesn't support, so filter them out.
        excludes = [
            "FunctionHungarianPrefix", "MethodHungarianPrefix",
            "NamespaceHungarianPrefix"
        ]
        config = [
            line for line in config if not any(e in line for e in excludes)
        ]
        return output, returncode

    def parse_checks(output, returncode):
        # Create one "--config" argument per group of 40 checks.
        nonlocal config
        # Ignore first line it's the header
        all_checks = output.splitlines()[1:]
        checks_per_thread = 40
        for offset in range(0, len(all_checks), checks_per_thread):
            checks = all_checks[offset:offset + checks_per_thread]
            # The second line of the configuration is overwritten with the
            # checks of this group (presumably the "Checks:" entry of the
            # dump - TODO confirm).
            config[1] = "Checks: '" + ",".join(checks) + "'\n"
            self.configs.append("--config=\"" + "".join(config) + "\"")
        return output, returncode

    def parse_include_tests(output, returncode):
        # The tests only print something for the folders which are missing:
        # any output is therefore turned into a failure.
        if output:
            returncode = 1
        return output, returncode

    def parse_system_includes(output, returncode):
        # Collect the lines starting with a space found from the first line
        # containing "search starts here" onwards.
        if returncode:
            logger.error("Failed to find system includes: %s", output)
            return output, returncode
        include_path_section = False
        for line in output.split("\n"):
            if "search starts here" in line:
                include_path_section = True
            if include_path_section and line.startswith(" "):
                logger.debug("Adding %s to includes", line)
                # Only the trailing whitespace is removed: the leading space
                # is kept.
                self.includes.append(line.rstrip())
        return output, returncode

    def parse_compile_commands_file(output, returncode):
        # The output is the content of compile_commands.json
        if returncode:
            logger.error("compile_commands.json not found. "
                         "Make sure to build PopTorch first.")
            return output, returncode
        self.process_compile_commands(json.loads(output))
        return output, returncode

    # run() returns the return code: a non-zero value means failure.
    if CondaCommand("g++ -E -x c++ - -v < /dev/null",
                    print_output=False,
                    output_processor=parse_system_includes).run():
        return False
    if CondaCommand(ClangTools.clang_tidy() + " --dump-config",
                    print_output=False,
                    output_processor=parse_config).run():
        return False
    if CondaCommand(ClangTools.clang_tidy() + " --list-checks",
                    print_output=False,
                    output_processor=parse_checks).run():
        return False
    tests = [
        f"test -d {i} || echo \"Include folder {i} not found\""
        for i in self.includes
    ]
    # stop_on_error is disabled so that every folder gets tested.
    if CondaCommand(";".join(tests),
                    stop_on_error=False,
                    output_processor=parse_include_tests).run():
        return False
    # Check if there is a compile_commands.json
    if CondaCommand("cat ${CONDA_PREFIX}/../compile_commands.json",
                    print_output=False,
                    output_processor=parse_compile_commands_file).run():
        return False
    return compare_versions_from_output(ClangTools.clang_tidy(), "13.0.1",
                                        "version")
class Linters:
    """Interface class used to lint files"""

    def __init__(self):
        self._linters = [CppLinters(), PyLinters()]

    def _get_git_files(self, strategy):
        """Return the list of files selected by the given GitStrategy.

        :raises RuntimeError: if the strategy is not handled.
        """
        files = []

        class GetFiles:
            """Output processor collecting the last field of each line."""

            def __init__(self, files):
                self.files = files

            def __call__(self, output, returncode):
                # If we keep the last element of each line we will have the
                # files we need to lint.
                # ['M', 'poptorch/source/dispatch_tracer/RegisterAtenOverloads.cpp']
                # ['R092', 'poptorch/source/dispatch_tracer/dispatchers/Tracer.hpp', 'poptorch/source/dispatch_tracer/dispatchers/IDispatch.hpp']
                # ['A', 'poptorch/source/dispatch_tracer/dispatchers/JitDispatch.hpp']
                for line in output.splitlines():
                    fields = line.split()
                    # A blank line has no field: skip it rather than fail
                    if fields:
                        self.files.append(fields[-1])
                return output, returncode

        assert isinstance(strategy, GitStrategy)
        git_cmd = ""
        filter_cmd = "| grep \"^[AMRT]\" "
        if strategy in [GitStrategy.Master, GitStrategy.PreCommit]:
            git_cmd = "git diff --name-status -r origin/mk2-main "
        elif strategy == GitStrategy.Head:
            git_cmd = "git diff --name-status -r HEAD^ "
        elif strategy == GitStrategy.Diff:
            git_cmd = "git diff --name-status -r HEAD "
        elif strategy == GitStrategy.All:
            git_cmd = "git ls-tree --name-only -r HEAD "
            # ls-tree only prints names: there is no status to filter on
            filter_cmd = ""
        else:
            raise RuntimeError(f"Unknown strategy requested {strategy}")
        Command(git_cmd,
                filter_cmd,
                print_output=False,
                output_processor=GetFiles(files)).run()
        return files

    def lint_git(self, strategy, autofix, add_trailer_on_success):
        """Lint the files selected by the given GitStrategy."""
        return self.lint_files(self._get_git_files(strategy), autofix,
                               add_trailer_on_success)

    def _read_head_trailer(self):
        """Return the value of the Lint-Ok trailer of HEAD.

        An empty string is returned if the command printed nothing.
        """
        out = SaveOutput()
        Command("git show -s --pretty='%(trailers:key=Lint-Ok,valueonly)'",
                print_output=False,
                output_processor=out).run()
        lines = out.output.splitlines()
        # No output at all used to raise an IndexError here
        return lines[0].strip() if lines else ""

    def _unstaged_diff(self, files):
        """Return the output of "git diff" for the given files."""
        out = SaveOutput()
        Command("git diff " + " ".join(files),
                print_output=False,
                output_processor=out).run()
        return out.output.strip()

    def _compute_git_trailer(self, files):
        """Return the MD5 hex digest of the files' concatenated content.

        The files are read in sorted order so the result doesn't depend on
        the order of the list.
        """
        contents = []
        for f in sorted(files):
            with open(f, "r", encoding="utf-8") as src:
                contents.append(src.read())
        return str(hashlib.md5("".join(contents).encode("utf-8")).hexdigest())

    def check_git_trailer(self, strategy):
        """Check the trailer of HEAD matches the files of the strategy."""
        return self._check_trailer(self._get_git_files(strategy),
                                   add_if_missing=False)

    def _check_trailer(self, files, add_if_missing):
        """Compare the trailer of HEAD with the one computed from the files.

        :returns: 0 if they match. Otherwise -1, after amending the last
                  commit with the expected trailer if add_if_missing is set.
        """
        head_trailer = self._read_head_trailer()
        files_trailer = self._compute_git_trailer(files)
        if files_trailer == head_trailer:
            logger.info("Git trailer present and up to date")
            return 0
        if add_if_missing:
            logger.warning(
                "Files are linted but trailer is either missing or out of "
                "date, updating it:")
            git_cmd = (
                "echo \"$(git log -1 --pretty=format:%B | "
                "git interpret-trailers --if-exists='replace' --trailer "
                f"'Lint-Ok: {files_trailer}' --if-exists=replace)\" | "
                "git commit --amend --no-edit -F -")
            Command(git_cmd).run()
            logger.warning(
                "If you were trying to push your local branch to Github, "
                "try again.")
        else:
            logger.error(
                "Files haven't been linted: expected the git trailer to be "
                "'%s' but found '%s'", files_trailer, head_trailer)
        return -1

    def lint_files(self, files, autofix, add_trailer_on_success):
        """Run the linters on the given files.

        :param files: paths of the files to lint
        :param autofix: let the linters fix the issues when they can
        :param add_trailer_on_success: check (and if needed add) the git
               trailer once the files have been linted successfully
        :returns: 0 on success, a value between 1 and 255 if a linter failed,
                  or the result of _check_trailer() when the files are clean
                  and add_trailer_on_success is set.
        """
        # If there is no local change and the trailer is up to date: no need to re-run the linters.
        if add_trailer_on_success and self._unstaged_diff(
                files) == "" and self._read_head_trailer(
                ) == self._compute_git_trailer(files):
            logger.info(
                "Git trailer already present and up to date: early return")
            return 0

        jobs = {}
        for f in files:
            cmd = self._gen_lint_commands(f, autofix)
            if cmd:
                jobs[f] = cmd
        if not jobs:
            logger.info("No linter to run: early return")
            return 0

        executors = []
        for filename, cmd in jobs.items():
            print(f"Linting file {filename} [{len(cmd)}] commands to run")
            if autofix:
                # The linters modify the file: run them one after the other
                executors.append(Executor(filename, cmd))
            else:
                # No risk of conflicting modification in place
                # Merge the steps from all the linters
                all_steps = []
                for c in cmd:
                    all_steps += c
                executors.append(Executor(filename, [all_steps]))

        # Poll until every executor has completed
        while True:
            ProcessManager.get().update()
            pending = [e for e in executors if not e.execution_complete()]
            if not pending:
                break
            for e in pending:
                e.update()
            time.sleep(1)

        # Each executor is counted exactly once, after completion: adding its
        # return code on every polling round inflated the total. The total is
        # capped at 255 so that it cannot wrap around to 0 when used as the
        # exit status of the process.
        returncode = min(
            sum(abs(e.returncode) for e in executors if e.returncode), 255)

        if add_trailer_on_success:
            diff = self._unstaged_diff(files)
            if diff != "":
                logger.warning(
                    "Your commit needs to be amended to include the following "
                    "changes:\n%s", diff)
            if returncode == 0:
                return self._check_trailer(files, add_if_missing=True)
        return returncode

    def _gen_lint_commands(self, filename, autofix):
        """Return the commands of every linter family for the given file.

        Each entry of the returned list is a list of commands: a linter
        returning a single Command gets it wrapped in a list.
        """
        cmd = []
        for linter in self._linters:
            cmd += linter.gen_lint_commands(filename, autofix)
        return [[c] if isinstance(c, Command) else c for c in cmd]
def getChar():
    """Read one character from the terminal in raw mode.

    The terminal attributes are saved before switching to raw mode and are
    restored before returning, even if the read fails.

    :returns: the result of reading one character from TERM_STDIN.
    :raises termios.error: if the attributes cannot be read or restored.
    """
    fd = TERM_STDIN.fileno()
    # Backup this or the terminal will break on closing.
    # This is done outside of the "try" so that a failure to read the
    # attributes cannot reach the "finally" clause with nothing to restore.
    old_attr = termios.tcgetattr(fd)
    try:
        tty.setraw(fd)
        return TERM_STDIN.read(1)
    finally:
        # Reset the terminal. The second argument selects *when* the change
        # is applied and must be one of the TCSA* constants: TCIFLUSH is a
        # queue selector meant for tcflush() and only worked by coincidence
        # of its numerical value.
        termios.tcsetattr(fd, termios.TCSADRAIN, old_attr)
def should_skip(line):
    """Tell whether a line starts, once stripped, with ">>>" or "..."."""
    return line.strip().startswith((">>>", "..."))
def get_doc_str_line_number(element):
    """Return the line number on which the docstring of ``element`` starts.

    Args:
        element: an AST node with a ``body`` (function, class, ...).

    Returns:
        The ``lineno`` of the leading string literal when the first
        statement of the body is one, otherwise ``element.lineno``.

    Since Python 3.8 string literals are parsed as ``ast.Constant`` nodes
    and their ``lineno`` already points at the first line of the literal,
    so no correction for multi-line strings is needed (subtracting the
    number of new lines would point above the docstring).
    """
    body = getattr(element, "body", None)
    if body and isinstance(body[0], ast.Expr):
        value = body[0].value
        if isinstance(value, ast.Constant) and isinstance(value.value, str):
            return value.lineno

    # If the string lookup fails
    return element.lineno
def process_doc_str(hunspell, doc_str):
    """Send a docstring to hunspell line by line and handle its replies.

    Code blocks are blanked out first, lines rejected by ``should_skip``
    are ignored and the exclusions are removed from the remaining lines.
    Every reply which is not "*", "-" or "+..." is passed to
    ``process_incorrect_word``; an empty reply ends the current line.
    """
    proc = hunspell['proc']
    replies = hunspell['out']
    text = strip_code_blocks(doc_str.doc_str)

    for line_offset, raw_line in enumerate(text.split("\n")):
        if should_skip(raw_line):
            continue

        cleaned = remove_exclusions(raw_line)
        # The leading "^" escapes any hunspell command in the line
        proc.write(b"^" + cleaned.encode('utf-8') + b"\n")

        while True:
            if not replies:
                # Nothing received yet: keep polling while hunspell is alive
                assert proc.is_running()
                continue

            reply = replies.pop(0)
            if reply == "":
                break
            if reply in ("*", "-") or reply[0] == "+":
                continue

            process_incorrect_word(hunspell, reply, doc_str, line_offset)
class Version:
    """A dotted numeric version (for example "7.3.0") which can be compared.

    The string is split on "." and every component converted to ``int``, so
    "1.10" sorts after "1.9". A non numeric component makes ``int`` raise
    ``ValueError``.
    """

    def __init__(self, version_str):
        self.version = tuple(int(i) for i in version_str.split("."))

    def __lt__(self, other):
        # Let Python try the reflected operation / raise TypeError instead
        # of failing with an AttributeError on a foreign type.
        if not isinstance(other, Version):
            return NotImplemented
        return self.version < other.version

    def __eq__(self, other):
        if not isinstance(other, Version):
            return NotImplemented
        return self.version == other.version

    def __hash__(self):
        return hash(self.version)

    def __str__(self):
        return ".".join([str(v) for v in self.version])

    def __repr__(self):
        return str(self)
class Config:
    """Holds the options of the environment as plain attributes."""

    def __init__(self, install_linters, **opts):
        # Every keyword option becomes an attribute of the instance
        vars(self).update(opts)
        self.install_linters = install_linters

    def setDefault(self, **opts):
        """Set the given attributes, keeping the ones which already exist."""
        attributes = vars(self)
        for name, value in opts.items():
            attributes.setdefault(name, value)
_utils.run_commands(f". {self._activate_filename}", *cmds, env=env, stop_on_error=stop_on_error, stdout_handler=stdout_handler, stderr_handler=stderr_handler) def rmdir_if_exists(self, path): _utils.rmdir_if_exists(path) class BuildenvManager: def __init__(self, cache_dir=None, output_dir=None, python_version=None, use_conda_toolchains=False, install_linters=False, empty_env=False, **config): if python_version is None: python_version = platform.python_version() if python_version.startswith("3.6"): python_version = "3.7" logger.warning( "Python 3.6 is no longer supported, defaulting " "to %s, if you really want to " "use python 3.6 then use --python-version 3.6", python_version) self.output_dir = os.path.realpath(output_dir or os.getcwd()) self.cache_dir = cache_dir or _default_cache_dir() self.buildenv_dir = os.path.join(self.output_dir, "buildenv") self.conda_packages = [f"python={python_version}"] self.conda_channels = [] if not empty_env: self.conda_packages.append("conda-pack=0.5.0") # Support for python 3.6 was removed from pip in version 22.0 # https://pip.pypa.io/en/stable/news/#v22-0 if python_version.startswith("3.6"): self.conda_packages.append("pip=21.1.3") is_aarch64 = _utils.get_arch_type() == "aarch64" if not is_aarch64 and not empty_env: # There is not one version of gdb which works # for both python 3.6.8 (CentOS 7) and python 3.9 if python_version.startswith("3.6.8"): self.conda_packages.append("gdb=8.3") else: self.conda_packages.append("gdb=10.2") self.projects = {} if use_conda_toolchains: self.conda_packages += _conda_toolchains_packages self.config = Config(install_linters=install_linters, is_aarch64=is_aarch64, **config) assert self.output_dir != _utils.sources_dir(), ( "This script needs " "to be called from a build directory. 
def add_project(self, project, project_dir):
    """Register ``project`` under the real path of ``project_dir``.

    The directory must exist (checked with an assertion).
    """
    directory_exists = os.path.exists(project_dir)
    assert directory_exists
    resolved_dir = os.path.realpath(project_dir)
    self.projects[project] = resolved_dir
# pylint: disable=exec-used exec(code, {"os": os, "_utils": _utils}, exec_locals) # Process the installers: other_installers = [] for i in installers(): if isinstance(i, CondaPackages): self.conda_packages += i.packages elif isinstance(i, CondaChannels): self.conda_channels += i.channels else: other_installers.append(i) packages = collections.defaultdict(list) # Resolve version conflicts # Create a dictionary package name -> [ versions ] for package in self.conda_packages: s = package.replace("==", "=").split("=") name = s[0] version = [Version(s[1])] if len(s) > 1 else [] packages[name] += version self.conda_packages = [] # Make sure the packages are unique and in a deterministic order for name in sorted(packages.keys()): versions = packages[name] if not versions: self.conda_packages.append(name) logger.warning("Version for package %s is not set", name) continue # Sort the versions by descending order and remove duplicates versions = list(set(versions)) versions.sort(reverse=True) if len(versions) > 1: logger.warning( "Conflict: more than one version requested for " "package %s: %s, selecting %s", name, versions, versions[0]) self.conda_packages.append(f"{name}={str(versions[0])}") return other_installers def create(self, create_template_if_needed=False): os.makedirs(self.output_dir, exist_ok=True) os.chdir(self.output_dir) self._clear_activate_buildenv() self._install_conda_if_needed() installers = self._collect_installers() env_hash = self._compute_environment_hash(installers) template_name = f"poptorch_{env_hash}.tar.gz" full_template_name = os.path.join(self.cache_dir, template_name) with self.cache_lock(): if os.path.isfile(full_template_name): logger.info("Found template %s: Unpacking to %s", full_template_name, self.buildenv_dir) os.makedirs(self.buildenv_dir) os.chdir(self.output_dir) tar = tarfile.open(full_template_name) tar.extractall(self.buildenv_dir) assert os.path.isdir(self.buildenv_dir) self.env.run_commands(f". 
{self.buildenv_dir}/bin/activate", "conda-unpack") self._append_to_activate_buildenv( f"conda activate {self.buildenv_dir}", ) else: logger.info( "Didn't find template %s: creating a new " "environment in %s", full_template_name, self.output_dir) self._create_new_env(installers) if create_template_if_needed: os.chdir(self.output_dir) self.env.run_commands( f"conda activate {self.buildenv_dir}", f"conda pack -p {self.buildenv_dir} -o \ {full_template_name}") if self.config.install_linters: self.env.run_commands(f"cd {_utils.sources_dir()}", "pre-commit install --hook-type pre-push") os.chdir(self.output_dir) # If ccache is available try: def ignore(_): pass self.env.run_commands("ccache -V", stdout_handler=ignore, stderr_handler=ignore) # CC / CXX -> Enable ccache for the current C / C++ compilers. self.env.run_commands( """echo "export CC=\\"ccache ${CC:-gcc}\\"" >> %s""" % self.activate_filename, """echo "export CXX=\\"ccache ${CXX:-g++}\\"" >> %s""" % self.activate_filename) except AssertionError: pass def _create_new_env(self, installers, is_retry=False): """ Sometimes the Conda install in the NFS cache gets corrupted: CondaVerificationError: The package for setuptools located at /nfs/conda//miniconda/pkgs/setuptools-58.0.4-py38h578d9bd_2 appears to be corrupted. When this happens: delete the conda install and start again with "is_retry=True" to avoid getting stuck in an infinite loop. 
""" os.chdir(self.output_dir) corrupted = False def check_corruption(line): nonlocal corrupted if "CondaVerificationError" in line: corrupted = True logger.error(line) stderr_handler = None if is_retry else check_corruption try: _utils.rmdir_if_exists(self.buildenv_dir) def getChannels(): return "".join(f" -c {c}" for c in self.conda_channels) self.env.run_commands( f"conda create --prefix {self.buildenv_dir}{getChannels()} " f"-y {' '.join(self.conda_packages)}", stderr_handler=stderr_handler) except AssertionError: if corrupted: # We failed because of some corrupted packages: clear # the environment, reinstall Conda and try again. self._clear_activate_buildenv() self._install_conda_if_needed(force_reinstall=True) self._create_new_env(installers, is_retry=True) return raise self._append_to_activate_buildenv( f"conda activate {self.buildenv_dir}", ) for i in installers: os.chdir(self.output_dir) i.install(self.env) def _clear_activate_buildenv(self): # Clear the content of activate_buildenv.sh # PYTHONNOUSERSITE -> Make Conda ignore packages installed in ~/.local # CCACHE_CPP2 -> Switch ccache to C++ mode (Avoid issues with C pre-processor) with open(self.activate_filename, "w") as f: f.write("# Save the existing environment\n") f.write("_print_var_names (){\n") # grep: only keep lines containing 'declare -x' (Removes # multi-lines content as we only care about variable names). # cut -f1: remove the right hand side of the assignment. # cut -f3: remove 'declare -x' # tr: replace new lines with spaces. f.write(" export -p | grep \"declare -x\" | " "cut -d '=' -f1 | cut -d ' ' -f3 | " "tr '\n' ' '\n") f.write("}\n") f.write("_saved_names=$(_print_var_names)\n") f.write("_saved_vars=\"$(export -p)\"\n") f.write("_saved_ps1=\"$PS1\"\n\n") f.write( "# Use 'deactivate_buildenv' to restore your former environment\n" ) # Note: using 'eval' inside a function doesn't affect the parent # environment, which is why we need to use an alias instead. 
@contextlib.contextmanager
def cache_lock(self):
    """Context manager holding an exclusive lock on the cache directory.

    The lock is a ``flock`` on ``conda.lock`` inside ``self.cache_dir``.
    Scopes can be nested: when ``self.lock_already_acquired`` is set the
    lock is owned by an enclosing scope and nothing is locked again.
    """
    if not self.lock_already_acquired:
        lock_path = os.path.join(self.cache_dir, "conda.lock")
        with open(lock_path, "w") as lock_file:
            try:
                fcntl.flock(lock_file, fcntl.LOCK_EX)
                self.lock_already_acquired = True
                yield
            finally:
                self.lock_already_acquired = False
                fcntl.flock(lock_file, fcntl.LOCK_UN)
    else:
        # An enclosing scope owns the lock: just run the body
        yield
{conda_sh}") return conda_install_dir = os.path.join(self.cache_dir, "mambaforge") conda_sh = os.path.join(conda_install_dir, "etc", "profile.d", "conda.sh") installer = os.path.join(self.cache_dir, "Mambaforge_installer.sh") with self.cache_lock(): if os.path.isfile(conda_sh) and not force_reinstall: logger.info( "System conda not found, using the instance from the cache " "(%s) instead", self.cache_dir) else: logger.info( "System conda not found, installing it locally in (%s)", self.cache_dir) if not os.path.isfile(installer): logger.info("Installer not found: downloading...") conda_os = "" os_type = _utils.get_os_type() if os_type == _utils.OsType.Linux: conda_os = "Linux" elif os_type == _utils.OsType.Osx: conda_os = "MacOSX" else: raise RuntimeError( "Unknown OS. Please download the " "installer for your platform from " "https://github.com/conda-forge/miniforge#mambaforge" f" and save it as ${installer}") arch_type = _utils.get_arch_type() # Use Mamba that is not relied on Conda with version > 4.14.0 # while we wait for https://github.com/conda/conda/issues/12250 # to be fixed. (Issue with paths > 128 characters) # As soon as it will be fixed we can use "latest" release. url = ("https://github.com/conda-forge/miniforge/" "releases/download/4.14.0-0/Mambaforge-" f"4.14.0-0-{conda_os}-{arch_type}.sh") urllib.request.urlretrieve(url, installer) _utils.rmdir_if_exists(conda_install_dir) _utils.run_commands( f"bash {installer} -b -p {conda_install_dir}") assert os.path.isfile(conda_sh) self._append_to_activate_buildenv(f". 
def _compute_environment_hash(self, installers):
    """Return the MD5 hex digest identifying the requested environment.

    The digest covers, joined by spaces and in that order: the Conda
    packages, the ``hashString()`` of every installer and the Conda
    channels.
    """
    installer_hashes = [installer.hashString() for installer in installers]
    components = [
        *self.conda_packages, *installer_hashes, *self.conda_channels
    ]
    digest = hashlib.md5(" ".join(components).encode("utf-8"))
    return str(digest.hexdigest())
= os.path.basename(path_dir) m = re.match("(.*).buildenv.py", filename) if m and m.group(1) != "config": project = m.group(1) path_dir = os.path.dirname(path_dir) if project is None: project = path_dir.split(os.path.sep)[-1] manager.add_project(project, path_dir) elif not args.empty_env: manager.add_project("poptorch", _utils.sources_dir()) manager.create(args.create_template_if_needed) ================================================ FILE: scripts/docs_build.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import argparse import logging import os import shutil import subprocess import sys import zipfile import sphinx.cmd.build from utils import _utils logger = logging.getLogger(os.path.basename(__file__)) _utils.set_logger(logger) class DocumentationBuilder: def __init__(self, pkg_info, install_dir=None, poptorch_geometric=False): self.pkg_info = pkg_info self.pdf_filename = pkg_info.pdf_filename(poptorch_geometric) self.html_filename = pkg_info.html_filename(poptorch_geometric) self.doc_name = pkg_info.poptorch_geometric_doc_name if \ poptorch_geometric else pkg_info.doc_name self.output_dir = os.path.join( "docs", "poptorch_geometric") if poptorch_geometric else "docs" self.output_pdf_dir = os.path.join(self.output_dir, "pdf") self.output_html_dir = os.path.join(self.output_dir, "html") self.output_guide_dir = os.path.join(self.output_html_dir, self.doc_name) src_dir = os.path.join( "docs", "poptorch_geometric") if poptorch_geometric else "docs" self.docs_src_dir = os.path.join(_utils.sources_dir(), src_dir, "user_guide") self.sphinx_conf_dir = os.path.join(_utils.sources_dir(), src_dir, "common") self.title = _utils.get_first_line( os.path.join(self.docs_src_dir, "index.rst")) self.install_dir = install_dir or "." 
def package_html(self):
    """Zip the generated HTML guide into ``install_dir/html_filename``.

    Walks ``self.output_guide_dir`` and stores every file under its path
    relative to ``self.output_html_dir``. Directories whose path ends with
    ".doctrees" or "_sources" and the files "objects.inv" and ".buildinfo"
    are left out.
    """
    excluded_dirs = (".doctrees", "_sources")
    excluded_files = ("objects.inv", ".buildinfo")
    archive_path = os.path.join(self.install_dir, self.html_filename)

    # The context manager closes the archive even if a write fails
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for root, _, files in os.walk(self.output_guide_dir):
            # NOTE(review): only the excluded directory itself is skipped,
            # its sub-directories are still walked (same as before) -
            # confirm whether they should be pruned too.
            if root.endswith(excluded_dirs):
                continue
            # Remove docs/html/ prefix. relpath only strips the leading
            # directory, whereas str.replace would also remove any later
            # occurrence of that string in the path.
            new_root = os.path.relpath(root, self.output_html_dir)
            for file in files:
                if file in excluded_files:
                    continue
                archive.write(os.path.join(root, file),
                              arcname=os.path.join(new_root, file))
    logger.info("%s was successfully generated", self.html_filename)
self.common_sphinx_flags + [ "-b", "latex", "-D", f"project={self.doc_name}", "-D", f"release=v{self.pkg_info.version_long}", "-D", f"version=v{self.pkg_info.version_long}", self.docs_src_dir, self.output_pdf_dir ] os.environ["DOC_TITLE"] = self.title assert not sphinx.cmd.build.build_main(args), ( f"The command sphinx-build {' '.join(args)} failed " "(See above for details)") subprocess.check_output(["make", "LATEXMKOPTS=\"-silent\""], cwd=self.output_pdf_dir) shutil.copyfile(os.path.join(self.output_pdf_dir, "doc.pdf"), os.path.join(self.install_dir, self.pdf_filename)) logger.info("%s was successfully generated", self.pdf_filename) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--no-pdf", action="store_true", help="Do not generate the PDF documentation") parser.add_argument("--no-html", action="store_true", help="Do not generate the HTML documentation") parser.add_argument("--debug", "-d", action="store_true", help="Print debug messages") parser.add_argument("--add-to-sys-path", help="Path to add to sys.path") parser.add_argument("--install-dir", help="Copy generated files to that folder") args = parser.parse_args() logging_level = logging.DEBUG if args.debug else logging.INFO logging.basicConfig(level=logging_level) logger.debug("Args: %s", str(args)) if args.add_to_sys_path: for path in args.add_to_sys_path.split(";"): logger.debug("Adding %s", path) sys.path.insert(0, path) poptorch_builder = DocumentationBuilder( _utils.PkgInfo.load_from_file(must_exist=False), install_dir=args.install_dir) poptorch_geometric_builder = DocumentationBuilder( _utils.PkgInfo.load_from_file(must_exist=False), install_dir=args.install_dir, poptorch_geometric=True) if not args.no_pdf: poptorch_builder.build_pdf() poptorch_geometric_builder.build_pdf() if not args.no_html: poptorch_builder.build_html() poptorch_builder.package_html() poptorch_geometric_builder.build_html() poptorch_geometric_builder.package_html() 
================================================ FILE: scripts/download_external_datasets.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import argparse import os.path as osp import torch_geometric as pyg parser = argparse.ArgumentParser(description="Download external datasets") parser.add_argument( "external_datasets_dir", help="The directory where the external datasets will be downloaded.") args = parser.parse_args() pyg.datasets.QM9(root=osp.join(args.external_datasets_dir, "qm9")) pyg.datasets.Planetoid(osp.join(args.external_datasets_dir, "planetoid"), "Cora") ================================================ FILE: scripts/enable.sh.in ================================================ #!/bin/bash export PYTHONPATH=@CMAKE_INSTALL_PREFIX@:$PYTHONPATH @ENABLE_POPLAR_CMD@ @ENABLE_POPART_CMD@ ================================================ FILE: scripts/generate_poppyg_package.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import argparse import datetime import os import tempfile import subprocess import shutil import distutils.util import distutils.dir_util import utils._utils as utils from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag targets = ['bdist_wheel', 'sdist', 'install'] parser = argparse.ArgumentParser() parser.add_argument('--python-dir', help='Path to the folder containing the python files') parser.add_argument('target', choices=targets, help=f'Which target to build: {targets}') parser.add_argument('--output-dir', default='dist', help='Where to create the packages') args = parser.parse_args() PROJ_NAME = 'poptorch_geometric' src_dir = os.path.join(utils.sources_dir(), PROJ_NAME) output_dir = os.path.realpath(args.output_dir) python_dir = os.path.realpath(args.python_dir) VERSION = utils.PkgInfo.load_from_file(must_exist=False, path='..').version_long # https://www.python.org/dev/peps/pep-0425/ # The platform tag is simply distutils.util.get_platform() with all hyphens - and periods . replaced with underscore _. PLATFORM = distutils.util.get_platform().replace('.', '_').replace('-', '_') def find_requirement(package): with open(os.path.join(src_dir, 'requirements.txt'), 'r') as f: for line in f: if package in line: return line.strip() return None def get_pyg_hosted_dependency(pkg_name): name_and_version = find_requirement(pkg_name) assert name_and_version is not None, f'{pkg_name} not found.' # For sdist packages we don't know ahead of time what the python version # will be, and there is no support for --find-links so we just have to # use the regular wheel instead. 
if args.target != "bdist_wheel": return name_and_version pkg_ver = name_and_version.split('=')[-1] file_name = pkg_name.replace('-', '_') pkg_whl = f'{pkg_name} @ https://data.pyg.org/whl/torch-2.0.0%2Bcpu/{file_name}-{pkg_ver}-{get_abbr_impl()}{get_impl_ver()}-{get_abi_tag()}-{PLATFORM}.whl' return pkg_whl PYG_DEPENDENCY = find_requirement('torch-geometric') or find_requirement( 'pyg-nightly') if PYG_DEPENDENCY is None: raise RuntimeError('"torch-geometric" not found in requirements.txt') SCATTER_DEPENDENCY = get_pyg_hosted_dependency('torch-scatter') SPARSE_DEPENDENCY = get_pyg_hosted_dependency('torch-sparse') POPTORCH_DEPENDENCY = f'poptorch=={VERSION}' def configure(src_filename, dst_filename): with open(dst_filename, 'w') as f: for line in open(src_filename): f.write( line.replace('@VERSION@', VERSION) \ .replace('@PYG_DEPENDENCY@', PYG_DEPENDENCY) \ .replace('@POPTORCH_DEPENDENCY@', POPTORCH_DEPENDENCY) \ .replace('@PLATFORM@', PLATFORM) \ .replace('@TORCH_SCATTER_DEPENDENCY@', SCATTER_DEPENDENCY) \ .replace('@TORCH_SPARSE_DEPENDENCY@', SPARSE_DEPENDENCY) ) # Create a temporary directory and copy the files to package to it. with tempfile.TemporaryDirectory() as tmp_dir: os.chdir(tmp_dir) shutil.copytree(python_dir, PROJ_NAME) shutil.copy(os.path.join(src_dir, 'MANIFEST.in'), '.') shutil.copy(os.path.join(src_dir, 'License.txt'), '.') shutil.copy(os.path.join(src_dir, 'setup.cfg'), '.') configure(os.path.join(src_dir, 'setup.py'), 'setup.py') env = {**os.environ} start = datetime.datetime.now() if args.target == 'install': subprocess.check_call( f'python3 setup.py build_ext -b {output_dir}'.split(), env=env) else: extra_opts = '' if args.target == 'sdist': extra_opts = '--formats=zip' subprocess.check_call( f'python3 setup.py {args.target} -d {output_dir} {extra_opts}'. 
split(), env=env) print(f'Time to generate {args.target} in {output_dir} : ' f'{datetime.datetime.now()-start}') ================================================ FILE: scripts/generate_python_package.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import argparse import datetime import os import tempfile import subprocess import shutil import distutils.util import distutils.dir_util from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag import utils._utils as utils targets = ["bdist_wheel", "sdist", "install"] parser = argparse.ArgumentParser() parser.add_argument("--python-dir", default="include", help="Path to the folder containing the python files") parser.add_argument( "--include-dir", default="include", help="Path to the include folder needed to compile the wheel") parser.add_argument( "--lib-dir", default="lib", help= "Path to the folder containing the libraries needed to compile the wheel") parser.add_argument( "--standalone", help=("Colon separated list of folders to add to the lib folder of the " "sdist / wheel package")) parser.add_argument("target", choices=targets, help=f"Which target to build: {targets}") parser.add_argument("--output-dir", default="dist", help="Where to create the packages") args = parser.parse_args() def get_version_from_requirements(package): with open(os.path.join(src_dir, 'requirements.txt'), 'r') as f: for line in f: if package in line and not 'cpu' in line: name_and_version = line.split(';')[0].split('=') return name_and_version[-1].strip() return None def get_torch_dependency(package, version): if "aarch64" in utils.get_arch_type(): # There is no +cpu variant of Torch on Arm return f'{package}=={version}' # For sdist packages we don't know ahead of time what the python version # will be, and there is no support for --find-links so we just have to # use the regular torch wheel instead. 
if args.target != "bdist_wheel": return f'{package}=={version}' return f"{package} @ https://download.pytorch.org/whl/cpu/{package}-{version}%2Bcpu-{get_abbr_impl()}{get_impl_ver()}-{get_abi_tag()}-{PLATFORM}.whl" def get_poptorch_version(): version = utils.PkgInfo.load_from_file(must_exist=False, path="..").version_long if args.standalone is not None: # Only 1 "+" symbol allowed per version separator = "+" if "+" not in version else "_" version += separator + "standalone" return version VERSION = get_poptorch_version() # https://www.python.org/dev/peps/pep-0425/ # The platform tag is simply distutils.util.get_platform() with all hyphens - and periods . replaced with underscore _. PLATFORM = distutils.util.get_platform().replace(".", "_").replace("-", "_") torch_ver = utils.get_required_torch_version() TORCH_DEPENDENCY = get_torch_dependency('torch', torch_ver) src_dir = utils.sources_dir() # torch{audio, vision} are added here to prevent the torch upgrade when other # packages depend on torch{audio, vision}. 
torchaudio_ver = get_version_from_requirements('torchaudio') TORCHAUDIO_DEPENDENCY = get_torch_dependency('torchaudio', torchaudio_ver) torchvision_ver = get_version_from_requirements('torchvision') TORCHVISION_DEPENDENCY = get_torch_dependency('torchvision', torchvision_ver) # Only keep files of a given extension class ExtOnly: def __init__(self, *ext): self.ext = ext def _is_ignored(self, file): return not any(file.endswith(ext) for ext in self.ext) def __call__(self, adir, filenames): # Return the files to ignore return [f for f in filenames if self._is_ignored(f)] include_dir = os.path.realpath(args.include_dir) lib_dirs = [os.path.realpath(args.lib_dir)] if args.standalone is not None: lib_dirs += [os.path.realpath(l) for l in args.standalone.split(":")] output_dir = os.path.realpath(args.output_dir) python_dir = os.path.realpath(args.python_dir) def configure(src_filename, dst_filename): with open(dst_filename, "w") as f: for line in open(src_filename): f.write( line.replace("@VERSION@", VERSION) \ .replace("@PLATFORM@", PLATFORM) \ .replace("@TORCH_DEPENDENCY@", TORCH_DEPENDENCY) \ .replace("@TORCHAUDIO_DEPENDENCY@", TORCHAUDIO_DEPENDENCY) \ .replace("@TORCHVISION_DEPENDENCY@", TORCHVISION_DEPENDENCY) ) # Create a temporary directory and copy the files to package to it. with tempfile.TemporaryDirectory() as tmp_dir: os.chdir(tmp_dir) shutil.copytree(os.path.join(src_dir, "python"), "src", ignore=ExtOnly(".cpp")) shutil.copytree(python_dir, "poptorch") # distutils won't throw an exception if the destination already exists, # which will happen if lib_dirs contains more than one element. 
for lib_dir in lib_dirs: distutils.dir_util.copy_tree(lib_dir, "poptorch/lib") shutil.copytree(include_dir, "include") shutil.copy(os.path.join(src_dir, "MANIFEST.in"), ".") shutil.copy(os.path.join(src_dir, 'setup.cfg'), '.') shutil.copy(os.path.join(src_dir, 'License.txt'), '.') shutil.copy(os.path.join(src_dir, 'poptorch_third_party_licenses.txt'), '.') configure(os.path.join(src_dir, "setup.py"), "setup.py") configure(os.path.join(src_dir, "pyproject.toml"), "pyproject.toml") # distutils doesn't like spaces in CXX (https://github.com/mapnik/python-mapnik/issues/99#issuecomment-527591113) env = {**os.environ} cc = env.get("CC", "gcc") cxx = env.get("CXX", "g++") # Only keep the real compiler: e.g "cmake gcc" -> "gcc" cc = cc.split(" ")[-1] cxx = cxx.split(" ")[-1] env["CXX"] = cxx env["CC"] = cc start = datetime.datetime.now() if args.target == "install": subprocess.check_call( f"python3 setup.py build_ext -b {output_dir}".split(), env=env) dst_dir = f"{output_dir}/poptorch/lib" if os.path.isdir(dst_dir): shutil.rmtree(dst_dir) shutil.copytree("poptorch/lib", dst_dir, ignore=ExtOnly(".so")) else: extra_opts = "" if args.target == "sdist": extra_opts = "--formats=zip" subprocess.check_call( f"python3 setup.py {args.target} -d {output_dir} {extra_opts}". split(), env=env) print(f"Time to generate {args.target} in {output_dir} : " f"{datetime.datetime.now()-start}") ================================================ FILE: scripts/popgen/__init__.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import enum import sys from popgen import onnx onnx.init() onnx.parse_signatures() class PtrOrRef(enum.Enum): PTR = 0 REF = 1 # Root class for all expressions - the result of applying an operator # to a list of arguments class Value: def __init__(self, op, args, const=False, ptr_or_ref=None): assert isinstance(args, list), \ "args should be a list in Value::__init__" self.op = op self.args = args self.cname = "" self.graph_arity = None self.annotation = [] self.const = const self.ptr_or_ref = ptr_or_ref # perform dynamic casting for literals - makes for nice syntax for i, arg in enumerate(args): if isinstance(arg, float): self.args[i] = ConstantFloat(arg) # emit tensor parameters in an initilizer list self.tensor_braces = True # operator overloading - syntax sugar # note that we can't support __eq__ -- it would make the object unhashable def __add__(self, other): return Value('add', [self, other]) def __ge__(self, other): return Value('logical_or', [self > other, self.equal(other)]) def __gt__(self, other): return Value('greater', [self, other]) def __le__(self, other): return Value('logical_or', [self < other, self.equal(other)]) def __lt__(self, other): return Value('less', [self, other]) def __mul__(self, other): return Value('mul', [self, other]) def __ne__(self, other): return Value('logical_not', [self.equal(other)]) def __neg__(self): return Value('neg', [self]) def __sub__(self, other): return Value('sub', [self, other]) def __truediv__(self, other): return Value('div', [self, other]) def __radd__(self, other): return Value('add', [other, self]) def __rmul__(self, other): return Value('mul', [other, self]) def __rsub__(self, other): return Value('sub', [other, self]) def __rtruediv__(self, other): return Value('div', [other, self]) def equal(self, other): return Value('equal', [self, other]) def set_graph_arity(self, arity): self.graph_arity = arity def annotate(self, annot): self.annotation.append(annot) # emit(values, val_id, tabs, f, root) # # Emits C++ 
code for this value # Parameters: # values - map of previously generated Value objects and their C++ images (Value -> string) # val_id - the index of the first available temp variable # tabs - indentation string # f - output stream # root - True: we should generate a return statement # Returns: index of the next available temp variable def emit(self, values, val_id, tabs, f=sys.stdout, root=False): if self in values: return val_id val_id = self.emit_arguments(values, val_id, tabs, f) self.emit_annotations(tabs, f) # split tensor and non-tensor arguments if not self.args or isinstance(self.args[0], NonTensorValue): tensors = [] non_tensors = [values[arg] for arg in self.args] self.tensor_braces = False else: last_tensor = next(arg for arg in reversed(self.args) if not isinstance(arg, NonTensorValue)) last_tensor = len(self.args) - self.args[::-1].index(last_tensor) tensors = [values[arg] for arg in self.args[:last_tensor]] non_tensors = [values[arg] for arg in self.args[last_tensor:]] suffix = ";\n" if not root: suffix = "->output();\n" val_id = self.emit_assign_return(values, val_id, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) left_brace = ["{"] if self.tensor_braces else [] right_brace = ["}"] if self.tensor_braces else [] if self.op is None: f.write("nullptr" + suffix) else: capital_op = self.op[0].upper() + self.op[1:] self.emit_call("create" + capital_op, ["graph"] + left_brace + tensors + right_brace + non_tensors, suffix, f) return val_id # emit_arguments(values, val_id, tabs, f) # # Emits C++ code for the arguments this value # Parameters: # values - map of previously generated Value objects and their C++ images (Value -> string) # val_id - the index of the first available temp variable # tabs - indentation string # f - output stream # Returns: index of the next available temp variable def emit_arguments(self, values, val_id, tabs, f): for arg in self.args: val_id = arg.emit(values, val_id, tabs, f, False) return val_id # emit_annotations(tabs, f) # # Emits 
annotations as C++ comments # Parameters: # tabs - indentation string # f - output stream def emit_annotations(self, tabs, f): for annot in self.annotation: f.write(tabs + annot + "\n") # emit_assign_return(values, val_id, root, tabs, f) # # Emits either an assignment or a return statement # Parameters: # values - map of previously generated Value objects and their C++ images (Value -> string) # val_id - the index of the first available temp variable # tabs - indentation string # f - output stream # Returns: index of the next available temp variable def emit_assign_return(self, values, val_id, root, tabs, f, const=False, ptr_or_ref=None): if root: f.write(tabs + "return ") return val_id if isinstance(val_id, str): values[self] = val_id else: values[self] = "t" + str(val_id) val_id += 1 pr_qual = "" if ptr_or_ref == PtrOrRef.PTR: pr_qual = "*" elif ptr_or_ref == PtrOrRef.REF: pr_qual = "&" const_qual = "" if const: const_qual = "const " f.write(tabs + const_qual + "auto " + pr_qual + values[self] + " = ") return val_id # emit_call(fname, args, suffix, f) # # Emit a function call # Parameters: # fname - function name # args - arguments as list of strings # suffix - string to prepend after call # f - output stream def emit_call(self, fname, args, suffix, f): f.write(fname + "(") for (i, arg) in enumerate(args): if i > 0: if arg not in ["}"] and args[i - 1] not in ["{"]: f.write(", ") f.write(arg) f.write(')' + suffix) # vn() # # Return a value number for this object # Returns: tuple(operator, value numbers of arguments) def vn(self): return tuple([self.op] + [arg.vn() for arg in self.args]) # same(other) # # Returns True if the other operator is a potential match for this one def same(self, other): return self.op == other.op # render() # # Returns a string image of this object. Used for C++ annotations. 
def render(self): if self.op is None: return "" string = self.op + '(' for i, arg in enumerate(self.args): if i > 0: string += ', ' string += arg.render() return string + ')' # ConstantFloat(val) # # Represents a constant floating point value to be used as tensor argument # Parameters: # val - the floating point constant class ConstantFloat(Value): def __init__(self, val): Value.__init__(self, 'float', []) self.val = val def emit(self, values, val_id, tabs, f=sys.stdout, root=False): if self in values: return val_id suffix = ";\n" if not root: suffix = "->output();\n" if len(self.args) > 0: val_id = self.emit_arguments(values, val_id, tabs, f) val_id = self.emit_assign_return(values, val_id, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) self.emit_call( "createConstantFloatLike", ["graph", values[self.args[0]], "{", str(self.val), "}", "{}"], suffix, f) else: val_id = self.emit_assign_return(values, val_id, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) self.emit_call( "createConstantFloat32", ["graph", "{", str(self.val), "}", "{}"], suffix, f) return val_id def vn(self): return str(self.val) def same(self, other): return other.op == 'float' and self.val == other.val def render(self): return str(self.val) # NonTensorValue(op, args) # # Root class for non-tensor values. # Parameters: # op - operator # args - arguments class NonTensorValue(Value): def __init__(self, op, args): Value.__init__(self, op, args) ================================================ FILE: scripts/popgen/api.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved import inspect from popgen import generator, registry, values from popgen.operatorfactory import op # convert(aten, arity, popop=None, swizzles=None) # # Registers a conversion rule. 
# Parameters: # aten - name of the operator to be converted # arity - number of inputs of aten # popop - popART operator to be generated (None: same as aten) # swizzles - list of integer indices representing a permutation of inputs def convert(aten, arity, popop=None, swizzles=None): if popop is None: popop = aten if swizzles is None: swizzles = range(0, arity) inputs = [] for swz in swizzles: assert isinstance(swz, int) and swz in range(0, arity), \ "Illegal swizzle for " + aten inputs.append(values.InputValue("i" + str(swz), swz)) fn = getattr(op, popop) registry.add_handler(aten, fn(*inputs), arity) # expand(aten, fn) # # Registers an expansion rule # Parametrs: # aten - name of operator to be expanded # fn - function defining the expansion def expand(aten, fn): return registry.expand(aten, fn) # forward(source, dest) # # Registers a forwarding rule. Effect is to forward one operator to the # handlers of another. # Parameters: # source - name of forwarded operator # dest - name of operator whoose handlers are to be used def forward(source, dest): assert source not in registry.forwardings, \ source + " is forwarded twice" registry.forwardings[source] = dest # generate(namespace, filename) # # Generate C++ code. # Parameters: # script - name of the top-level script # namespace - the namespace of the operators # filename - file to write the code to # global_symbols - dictionary of global_symbols from top-level def generate(script, namespace, filename, global_symbols=globals()): generator.generate(script, namespace, filename, global_symbols) print("File successfully generated, remember to run " "'./scripts/apply_linters.py -a " + filename + "' before checking the file in") # simplify(name, fn) # # Registers a simplification rule. # Parameters: # operator_name - name of the operator to be greated as a string # fn - function defining the expression to be matched def simplify(name, fn): # computes the weight of the expression (i.e. 
the number of values involved # in the pattern). The matched will use this to break ties - the heviest # pattern is preferred def weight(value): result = 1 for arg in value.args: result += weight(arg) return result inputs = [] ops = inspect.signature(fn).parameters for idx, op in enumerate(ops): inputs.append(values.InputValue(op, idx)) pattern = fn(*inputs) registry.complex_ops[name] = (weight(pattern), pattern) ================================================ FILE: scripts/popgen/generator.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved import datetime import sys import re from popgen import registry, transform # emit_handlers(namespace, aten, handlers, f=sys.stdout) # # Emits the C++ handlers for one operator. # Parameters: # namespace - namespace the operator is in # aten - name of the operator # handlers - list of handlers. must differ in arity. # f - output stream def emit_handlers(namespace, aten, handlers, f=sys.stdout): values = dict() opname = get_camel_case_op_name(aten) emit_arity_check = len(handlers) > 1 decl = "torch::jit::Node *" + opname + "Handler(" + \ "torch::jit::Graph *graph, " + "torch::jit::Node *node) {" if len(decl) <= 80: f.write(decl + "\n") else: decl = "torch::jit::Node *" + opname + "Handler(" f.write(decl + "torch::jit::Graph *graph,\n") f.write(" " * len(decl)) f.write("torch::jit::Node *node) {\n") arities = set() for handler in handlers: assert handler.graph_arity not in arities, \ aten + " has multiple handlers with the same arity" arities.add(handler.graph_arity) values.clear() handler = transform.generate_complex_ops(handler) handler = transform.value_numbering(handler) handler = transform.generate_typed_constants(handler) handler.annotate("// " + handler.render()) if emit_arity_check: f.write(" if (node->inputs().size() == " + str(handler.graph_arity) + ") {\n") handler.emit(values, 0, " ", f, True) f.write(" }\n") else: handler.emit(values, 0, " ", f, True) if 
emit_arity_check: arity_list = sorted(list(arities)) expect_str = "Expecting " + str(arity_list[0]) for i in range(1, len(arity_list) - 1): expect_str += ', ' + str(arity_list[i]) if len(arity_list) > 1: expect_str += ' or ' + str(arity_list[-1]) if len(arity_list) > 1 or arity_list[0] > 1: expect_str += " operands, " else: expect_str += " operand, " f.write('\n std::stringstream errmsg;\n') f.write(' errmsg << "Incorrect number of arguments for operator ";\n') f.write(' errmsg << "' + namespace + '::' + aten + '. ";\n') f.write(' errmsg << "' + expect_str + '";\n') f.write( ' errmsg << "got " << node->inputs().size() << " operand(s).";\n') f.write(" ERROR(&errmsg);\n") f.write(" return nullptr;\n") f.write("}\n\n") # generate(script, namespace, filename, global_symbols) # # Generate a file containg C++ implementation of handlers # Parameters: # script - name of top-level script # namespace - the namespace the operators are in # filename - the output fil # global_symbols - dictionary of globals from top-level def generate(script, namespace, filename, global_symbols): f = open(filename, 'w') now = datetime.datetime.now() f.write('// DO NOT EDIT! Generated by ' + script + '\n') f.write('// Copyright (c) ' + str(now.year) + ' Graphcore Ltd. 
All rights reserved.\n\n') f.write('#include "../PoptorchStaticInit.hpp"\n') f.write('#include "../PoptorchSymbols.hpp"\n') f.write('#include "PopartCanonicalizationUtils.hpp"\n') f.write('#include "poptorch/OpBuilder.hpp"\n') f.write('#include "poptorch/Utils.hpp"\n') f.write('#include "poptorch_logging/Error.hpp"\n') f.write('#include "poptorch_logging/Logging.hpp"\n') f.write("\nnamespace poptorch {\n") f.write("\nnamespace {\n\n") registry.add_implicit_handlers(global_symbols) for (aten, handler) in sorted(registry.handlers.items()): emit_handlers(namespace, aten, handler, f) f.write("} // namespace\n") f.write("\n__attribute__((constructor(HANDLER_INIT_PRIORITY))) ") f.write("static void registration() {\n") for (source, _) in registry.forwardings.items(): transform.validate_forwarding(source) to_register = sorted( list(registry.handlers.keys()) + list(registry.forwardings.keys())) for aten in to_register: opname = get_camel_case_op_name(registry.forwardings.get(aten, aten)) reg_handler_line = (" registerHandler(" + namespace + "::" + aten + ", " + opname + "Handler);\n") if len(reg_handler_line) > 81: reg_handler_line = reg_handler_line.replace( ", ", ",\n ") f.write(reg_handler_line) f.write("}\n\n") f.write("} // namespace poptorch\n") f.close() registry.clear() # get_op_name(aten) # # Returns the name of the C++ handler function for an operator # Parameters: # aten - the name of the operator def get_op_name(aten): opname = aten.split(':')[-1] return opname def snake_to_camel_case(snake_case): rx = re.compile(r"_[A-z]") return re.sub(rx, lambda s: s.group(0).upper()[1], snake_case) def get_camel_case_op_name(aten): rx = re.compile(r"_$") return re.sub(rx, "InPlace", snake_to_camel_case(get_op_name(aten))) ================================================ FILE: scripts/popgen/helpers.py ================================================ # Copyright (c) 2020 Graphcore Ltd. 
All rights reserved from popgen import PtrOrRef, values # alpha(m, a): # # Generate the alpha computation required for operators that have implicit scaling # Parameters: # m - quantity to be scaled # a - scaling factor def alpha(m, a): return values.AlphaValue([m, a]) # as_ir(v) # # Helper that returns a vector of ints as IR constant # Parameters: # v - the input vector def as_ir(v): return values.Helper('AsIr', [v], 'intVectorToIrConstant', needs_graph=True, ptr_or_ref=PtrOrRef.PTR) # cint(n) # # Generate an integer as a C int literal or variable # Parameters: # n - value to be generated def cint(n): return values.NonTensorConstant('cint', n, 'constantToInt') # clong(n) # # Generate an integer as a C long literal or variable # Parameters # n - value to be generated def clong(n): return values.NonTensorConstant('clong', n, 'constantToLong') # clong(l) # # Generate a value as a list of C longs # Parameters # l - value to be generated def clong_list(l): return values.NonTensorHelper('clong_list', [l], 'constantToLongVec', expects_node=True) # cfloat(f) # # Generate a floating point as a C float literal or variable # Parameters # f - value to be generated def cfloat(f): return values.NonTensorConstant('cfloat', f, 'constantToFloat') # cstr(s) # # Generate a string as a C string literal or variable # Parameters # s - value to be generated def cstr(s): return values.NonTensorConstant('cstr', s, 'constantToString') # dimension(a, t) # # Helper for parameters that are dimensional indices # Parameters: # v - value representing a dimensional index # t - tensor type def dimension(v, t): return values.NonTensorHelper('dimension', [v, t], 'handleDimensionParam') # dimension_list(t, a) # # Produces a list with the dimensions of a tensor. Needed for some # reduction operators. 
# Parameters: # t - input tensor # a - axes vector (optional) def dimension_list(t, a=None): args = [t, a] if a is not None else [t] return values.NonTensorHelper('dimension_list', args, "reduceHelperDimensionCreator") # empty_initializer() # # Helper that produces an empty initializer list def empty_initializer(): return values.EmptyInitializer() # output_shape(index = 0) # # Generate a tensor shape for the output value. # Parameters # index - index of output (default: 0) def output_shape(idx=0): return tensor_shape(values.OutputValue(idx)) # output_type(index = 0) # # Generate the expected scalar type for the output value. # Parameters # index - index of output (default: 0) def output_type(idx=0): return scalar_type(values.OutputValue(idx)) # reduction(r) # # Converts reduction type from pytorch to popart # Parameters: # r - integer containing reduction Id def reduction(r): return values.NonTensorHelper('reduction', [r], 'convertReduceToPopart') # tensor_list(l) # # Generate a list of tensors # Parameters # l - value to be generated def tensor_list(l): return values.Helper('TensorList', [l], "handleTensorList", True) # tensor_long(t) # # Change the scalar type of a tensor to Long # Parameters # s - input tensor def tensor_long(t): return values.CastInPlace("inplace_cast", [t], 'at::ScalarType::Long') # tensor_shape(t) # # Generate the shape of a tensor as a C++ vector of ints # Parameters # t - input tensor def tensor_shape(t): return values.NonTensorHelper('tensor_shape', [t], "shapeFromTensor") # tensor_type(t) # # Generate the tensor type of the input # Parameters: # t - the input tensor def tensor_type(t): return values.TensorType(t) # scalar_type(t) # # Return the scalar type of a tensor # Parameters # t - input tensor def scalar_type(t): return values.NonTensorHelper('scalar_type', [t], 'getNodeScalarType') ================================================ FILE: scripts/popgen/onnx.py ================================================ # Copyright (c) 2020 
Graphcore Ltd. All rights reserved import logging import json import os import re from ctypes.util import find_library import clang.cindex current_dir = os.path.dirname(os.path.realpath(__file__)) logger = logging.getLogger('OnnxParser') poplar_include_dir = None popart_include_dir = None popart_files = ["builder.hpp", "builder.gen.hpp"] nodeBlacklist = { "DomainOpSet", "Builder", "getOpsetVersion", "AiOnnxOpset11", "AiOnnxOpset12" } # find_popart_includes(path=None) # # Validate path to popart include files def find_popart_includes(): assert "CONDA_PREFIX" in os.environ, ("You need to run this script from " "inside an activated buildenv") compile_commands = os.path.realpath( os.path.join(os.environ["CONDA_PREFIX"], "..", "compile_commands.json")) assert os.path.isfile(compile_commands), ( "You need to configure your build " "by running cmake") with open(compile_commands, "r") as f: cmds = json.load(f) regex = r'.*-isystem (.*popart.*?(/install)?/include) ?.*' for c in cmds: if "popart_compiler" in c["file"]: m = re.match(regex, c["command"]) if not m: continue return m.group(1) raise RuntimeError( "Failed to find path to PopART in compile_commands.json") def find_poplar_includes(): assert "CONDA_PREFIX" in os.environ, ("You need to run this script from " "inside an activated buildenv") compile_commands = os.path.realpath( os.path.join(os.environ["CONDA_PREFIX"], "..", "compile_commands.json")) assert os.path.isfile(compile_commands), ( "You need to configure your build " "by running cmake") with open(compile_commands, "r") as f: cmds = json.load(f) regex = r'.*-isystem (.*poplar.*?(/install)?/include) ?.*' for c in cmds: if "popart_compiler" in c["file"]: m = re.match(regex, c["command"]) if not m: continue return m.group(1) raise RuntimeError( "Failed to find path to Poplar in compile_commands.json") # init(popart_path=None, clang_path=None, debug=False): # # Initialize parser module and logging object # Parameters: # popart_path - path to popART headers 
(default: autodetect) # clang_path - path to clang shared object (default: autodetect) # debug - True: enable debug logging def init(popart_path=None, poplar_path=None, clang_path=None, debug=False): global poplar_include_dir global popart_include_dir logging_level = logging.DEBUG if debug else logging.INFO logging.basicConfig(level=logging_level) if popart_path is None: popart_include_dir = find_popart_includes() else: builder_path = os.path.isfile( os.path.join(popart_path, "popart", "builder.hpp")) assert builder_path, ("Unable to locate popART's popart/builder.hpp " "in " + popart_path) popart_include_dir = popart_path if poplar_path is None: poplar_include_dir = find_poplar_includes() else: poplar_include_dir = poplar_path logger.info('Will pick up poplar headers from: %s', poplar_include_dir) logger.info('Will pick up popART headers from: %s', popart_include_dir) for (i, fname) in enumerate(popart_files): popart_files[i] = os.path.realpath( os.path.join(popart_include_dir, "popart", fname)) if clang.cindex.Config.loaded: # Already initialised return if clang_path is None: for version in [9, 8, 7, 6]: logger.debug('Trying to find: clang-%s', str(version)) clang_path = find_library('clang-' + str(version)) if clang_path is not None: break assert clang_path is not None, 'Could not find clang' logger.info('Will use clang: %s', clang_path) clang.cindex.Config.set_library_file(clang_path) # find_functions(node, namespace=""): # # Locate function declarations starting from an AST node # Parameters: # jsonOutput - reference to dictionary functions' dictionary # node - the AST node # namespace - C++ namespace of declarations def find_functions(jsonOutput, node, namespace=""): # If this is not the file path provided on the comand line, skip. 
if node.location.file is not None and \ os.path.realpath(str(node.location.file)) not in popart_files: return if node.spelling in nodeBlacklist: return if node.kind == clang.cindex.CursorKind.CLASS_DECL: namespace = node.spelling if node.kind != clang.cindex.CursorKind.CXX_METHOD: for child in node.get_children(): find_functions(jsonOutput, child, namespace) return functionName = node.spelling returnType = str(node.type.spelling).split("(")[0] operation = dict() operation["type"] = returnType operation["args"] = [] if node.access_specifier != clang.cindex.AccessSpecifier.PUBLIC: return argNum = 0 for child in node.get_children(): argument = {} if child.kind != clang.cindex.CursorKind.PARM_DECL: continue argument["type"] = child.type.spelling argument["name"] = child.spelling # skip 'name' argument if argument['name'] == 'name': continue argument["num"] = argNum operation["args"].append(argument) argNum += 1 if namespace not in jsonOutput: jsonOutput[namespace] = {} jsonOutput[namespace][functionName] = operation # parse() # # Parse popART header files and extract onnx operator information # Returns: # Map of operators, return types and arguments def parse(): index = clang.cindex.Index.create() path = os.path.realpath( os.path.join(popart_include_dir, "popart", "builder.hpp")) logger.info('Parsing: %s', path) tu = index.parse(path, args=[ "-std=c++14", "-I" + popart_include_dir, "-I" + poplar_include_dir, "-DONNX_NAMESPACE=onnx" ]) for diag in tu.diagnostics: logger.warning(diag) json = dict() find_functions(json, tu.cursor) classes = [] for name in json: if name.startswith("Ai"): classes.append(name) else: del json[name] classes.reverse() added_functions = set() for opset in classes: to_remove = [] for name in json[opset]: if name in added_functions: to_remove.append(name) else: added_functions.add(name) for name in to_remove: json[opset].pop(name) return json signatures = dict() def parse_signatures(): json = parse() classes = [] for classname in json: 
classes.append(classname) classes.reverse() type_map = { 'char': ['cstr'], 'bool': ['cint'], 'float': ['cfloat'], 'int64_t': ['clong', 'dimension'], 'int': ['cint'], 'unsigned int': ['cint'], 'std::string': ['cstr'], 'std::vector': ['cfloat_list', 'empty_initializer'], 'std::vector': ['clong_list', 'empty_initializer'], 'std::vector': ['cstr_list', 'empty_initializer'], 'nonstd::optional': ['cfloat', 'None'], 'nonstd::optional': ['cint', 'None'], 'nonstd::optional': ['clong', 'None'], 'nonstd::optional': ['cstr', 'None'], 'nonstd::optional >': ['clong_list', 'dimension_list', 'None'], 'nonstd::optional': 'ignore', 'nonstd::optional': 'ignore', 'Attributes::Float': ['cfloat'], 'Attributes::Int': ['clong'], 'Attributes::Ints': ['clong_list', 'empty_initializer'], 'popart::ReductionType': ['cint', 'reduction'], 'popart::ScatterReduction': ['cint', 'scatter_reduction'], 'popart::Builder': 'ignore', 'popart::ConstVoidData': 'ignore', 'popart::MultiConvDilations': 'ignore', 'popart::MultiConvInputs': 'ignore', 'popart::MultiConvPads': 'ignore', 'popart::MultiConvStrides': 'ignore', 'popart::TensorId': 'ignore', 'popart::DebugContext': 'popart::DebugContext', } for classname in classes: for op in json[classname]: args = json[classname][op]['args'] arglist = [] for arg in args: name = arg['name'] ty = arg['type'].replace('const ', '').replace(' &', '') if name == 'args': arglist.append('Args') continue if ty not in type_map: assert False, "Unsupported type " + ty + \ " in onnx.parse_signatures()" if type_map[ty] != 'ignore': arglist.append(type_map[ty]) signatures[op] = arglist ================================================ FILE: scripts/popgen/operatorfactory.py ================================================ # Copyright (c) 2020 Graphcore Ltd. 
# All rights reserved
from popgen import NonTensorValue, Value, onnx, poptorch
from popgen.helpers import empty_initializer


# no_tensor_braces(v):
#
# Modifiers for values that take tensors without initializer list braces
# Parameters:
#   v - the input value
def no_tensor_braces(v):
    v.tensor_braces = False
    return v


# def check_operator_signature(value, signatures)
#
# Verify an operator has correct signature
# Parameters:
#   value - the operator
#   signatures - signatures' dictionary
# Returns:
#   value, unchanged, once every checked operand has been accepted
def check_operator_signature(value, signatures):
    assert value.op in signatures, \
        str(value.op) + " is not a supported operator"

    actual_args = value.args
    expected_args = signatures[value.op]

    # Locate the first non-tensor operand. Only signatures that start with
    # 'Args' are inspected; any other signature is accepted as is.
    first_non_tensor = -1
    if expected_args[0] == 'Args':
        for i, arg in enumerate(actual_args):
            if arg.op == 'empty_initializer':
                continue
            if isinstance(arg, NonTensorValue):
                first_non_tensor = i
                break

        assert first_non_tensor != 0, 'Expecting at least 1 tensor ' + \
            'argument for ' + str(value.op)

    # no non-tensor arguments
    if first_non_tensor == -1:
        return value

    # check non-tensor arguments (slicing copies, value.args is not modified)
    expected_args = expected_args[1:]
    actual_args = actual_args[first_non_tensor:]

    # Report surplus operands explicitly instead of failing later with a
    # bare IndexError on expected_args[i].
    assert len(actual_args) <= len(expected_args), \
        'Too many operands for ' + str(value.op) + '. Got ' + \
        str(len(actual_args)) + ', expecting at most ' + \
        str(len(expected_args))

    # assume any missing arguments are optional
    # NOTE(review): the range deliberately keeps its original bounds, so the
    # last missing operand is never padded / checked - confirm this is the
    # intended handling of the trailing signature entry.
    for i in range(1, len(expected_args) - len(actual_args)):
        actual_args.append('None')

    for i, arg in enumerate(actual_args):
        if isinstance(arg, Value):
            arg = arg.op
        # str() keeps the message printable when the operand is not a string
        assert arg in expected_args[i], 'Incorrect operand ' + str(i) + \
            ' for ' + str(value.op) + '. Got ' + str(arg) + \
            ', expecting ' + 'one of: ' + str(expected_args[i])
    return value


# Factory class for creating popArt ops. Operators are created
# on the fly based on spelling of attributes.
class OperatorFactory: def __getattr__(self, name): if name in onnx.signatures: return lambda *args: \ check_operator_signature(Value(name, list(args)), \ onnx.signatures) if name in poptorch.signatures: return lambda *args: \ check_operator_signature(Value(name, list(args)), \ poptorch.signatures) raise ValueError(name + " is not a supported operator") def cast(self, t, ty): value = no_tensor_braces(Value('cast', [t, ty])) check_operator_signature(value, poptorch.signatures) return value def internalCast(self, t, ty): value = no_tensor_braces(Value('internalCast', [t, ty])) check_operator_signature(value, poptorch.signatures) return value def constantPad(self, x, l, c): value = no_tensor_braces(Value('constantPad', [x, l, c])) check_operator_signature(value, poptorch.signatures) return value def edgePad(self, t, l): value = no_tensor_braces(Value('edgePad', [t, l])) check_operator_signature(value, poptorch.signatures) return value def printIpuTensor(self, t, s): value = no_tensor_braces(Value('printIpuTensor', [t, s])) check_operator_signature(value, poptorch.signatures) return value def callCpuOp(self, t, s, n): value = no_tensor_braces(Value('callCpuOp', [t, s, n])) check_operator_signature(value, poptorch.signatures) return value def transpose(self, t): value = Value('transpose', [t, empty_initializer()]) check_operator_signature(value, onnx.signatures) return value def randomNormal(self, x, shape, high, low, scalar_type=None): args = [x, shape, high, low] if scalar_type is not None: args += [scalar_type] value = Value('randomNormal', args) check_operator_signature(value, poptorch.signatures) return value def randomUniform(self, x, shape, high, low, scalar_type=None): args = [x, shape, high, low] if scalar_type is not None: args += [scalar_type] value = no_tensor_braces(Value('randomUniform', args)) check_operator_signature(value, poptorch.signatures) return value def recomputationCheckpoint(self, x): value = no_tensor_braces(Value('recomputationCheckpoint', 
[x])) check_operator_signature(value, poptorch.signatures) return value def reflectionPad(self, t, l): value = no_tensor_braces(Value('reflectionPad', [t, l])) check_operator_signature(value, poptorch.signatures) return value def setAvailableMemory(self, x, y): value = no_tensor_braces(Value('setAvailableMemory', [x, y])) check_operator_signature(value, poptorch.signatures) return value def setMatMulSerialization(self, x, s, a, b): value = no_tensor_braces(Value('setMatMulSerialization', [x, s, a, b])) check_operator_signature(value, poptorch.signatures) return value def startForLoop(self, inputs): value = no_tensor_braces(Value('startForLoop', [inputs])) check_operator_signature(value, poptorch.signatures) return value def endForLoop(self, output, inputs, trip_count): value = no_tensor_braces( Value('endForLoop', [output, inputs, trip_count])) check_operator_signature(value, poptorch.signatures) return value def startIfBlock(self, condition): value = no_tensor_braces(Value('startIfBlock', [condition])) return value def startElseBlock(self, outputs_then): value = no_tensor_braces(Value('startElseBlock', [outputs_then])) return value def endIfBlock(self, outputs_else, condition): value = no_tensor_braces(Value('endIfBlock', [outputs_else, condition])) check_operator_signature(value, poptorch.signatures) return value def passThrough(self): return Value(None, []) op = OperatorFactory() ================================================ FILE: scripts/popgen/poptorch.py ================================================ # Copyright (c) 2020 Graphcore Ltd. 
# All rights reserved

# signatures for manually added operators
#
# Each entry maps an operator name to the list of its expected operands.
# A leading 'Args' entry is what check_operator_signature() looks for before
# validating the non-tensor operands; every following list enumerates the
# kinds accepted for one operand ('None' marks an operand that may be
# omitted).
signatures = {
    'beginIpuBlock': [['clong'], ['clong'], ['clong']],
    'cast': ['Args', ['scalar_type']],
    'internalCast': ['Args', ['cstr']],
    'constantPad': ['Args', ['clong_list'], ['cfloat']],
    'edgePad': ['Args', ['clong_list']],
    'optimizerGroup': [['clong'], ['tensor_list']],
    'printIpuTensor': ['Args', ['cstr']],
    'callCpuOp': [['tensor_list'], ['cstr'], ['node']],
    'randomNormal': [
        'Args', ['tensor_shape'], ['cfloat'], ['cfloat'],
        ['scalar_type', 'None']
    ],
    'randomUniform': [
        'Args', ['tensor_shape'], ['cfloat'], ['cfloat'],
        ['scalar_type', 'None']
    ],
    'recomputationCheckpoint': ['Args'],
    'reflectionPad': ['Args', ['clong_list']],
    'setAvailableMemory': ['Args', ['cfloat']],
    'setMatMulSerialization': ['Args', ['cstr'], ['clong'], ['cint']],
    'startForLoop': ['Args'],
    'endForLoop': ['Args', ['clong']],
    'startIfBlock': ['Args'],
    'startElseBlock': ['Args'],
    'endIfBlock': ['Args'],
}


================================================
FILE: scripts/popgen/registry.py
================================================
# Copyright (c) 2020 Graphcore Ltd. All rights reserved
import inspect
import re

from popgen import values

# simplification rules. operator_name -> value
complex_ops = dict()

# forwardings. from_operator -> to_operator
forwardings = dict()

# operator handlers.
operator_name -> list(value) handlers = dict() # add_handler(aten, value, arity) # # Register a new handler for an operator # Parameters: # aten - name of operator # value - root of the expansion expression # arity - number of unique graph nodes taken as input def add_handler(aten, value, arity): if aten not in handlers: handlers[aten] = [] value.set_graph_arity(arity) handlers[aten].append(value) # add_implicit_handlers(global_symbols) # # Inspect global namespace dictionary and register function handlers # Parameters: # global_symbols - dictianary of top-level globals def add_implicit_handlers(global_symbols): for name in global_symbols.keys(): fn = global_symbols[name] if not callable(fn): continue res = re.search('(.+)_handler$', name) if res: expand(res.group(1), fn) # clear(clear_complex_ops = False) # # Clears all internal dictionaries. # Parameters: # clear_complex_ops - clear complex_ops map (default: False) def clear(clear_complex_ops=False): handlers.clear() forwardings.clear() if clear_complex_ops: complex_ops.clear() # expand(aten, fn) # # Registers an expansion rule # Parametrs: # aten - name of operator to be expanded # fn - function defining the expansion def expand(aten, fn): inputs = [] ops = inspect.signature(fn).parameters for idx, op in enumerate(ops): inputs.append(values.InputValue(op, idx)) add_handler(aten, fn(*inputs), len(ops)) ================================================ FILE: scripts/popgen/transform.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved from popgen import registry, Value, ConstantFloat from popgen.values import InputValue # generate_complex_ops(value) # # Apply simplification rules to the expression rooted at the parameter. # New values are annotated with the applied transformation. 
# Parameters:
#   value - root of an expression
# Returns: root of simplified expression
def generate_complex_ops(value):
    # munch(value, pattern)
    #
    # Attempt to match a pattern to the expression rooted at value
    # Parameters:
    #   value - root of the original expression
    #   pattern - root of the pattern
    # Returns: tuple(match, list([(idx, arg), ...])
    #   match - True / False according to whether matching was successful
    #   (idx, arg) - arguments of the new complex operator
    #     idx - index of pattern's input node
    #     arg - value that is to become an argument with said index
    def munch(value, pattern):
        # A pattern input matches anything and captures the matched value
        if isinstance(pattern, InputValue):
            return (True, [(pattern.num, value)])

        if not pattern.same(value):
            return (False, None)

        captured = []
        for idx, sub_pattern in enumerate(pattern.args):
            matched, sub_args = munch(value.args[idx], sub_pattern)
            if not matched:
                return (False, None)
            captured.extend(sub_args)

        return (True, captured)

    # Attempt to match patterns in reverse order of weight and stop at
    # first match. Repeat process recursively.
    for name, entry in registry.complex_ops.items():
        (_, pattern) = entry
        (matched, pos_args) = munch(value, pattern)
        if not matched:
            continue

        # Place every captured value at the position of its pattern input
        ordered = [None] * len(pos_args)
        for (pos, arg) in pos_args:
            ordered[pos] = arg

        replacement = Value(name, ordered)
        replacement.annotation = value.annotation
        replacement.annotate("// matched " + name + ": " + pattern.render())
        return generate_complex_ops(replacement)

    # Nothing matched at this node: simplify the operands instead
    value.args = [generate_complex_ops(arg) for arg in value.args]
    return value


# generate_typed_constants(value)
#
# When possible, have constants inherit type information from sibling operands.
# This is achieved by attaching a sibling tensor operand as an argument to the
# constant. The emit function should then produce a creation call that borrows
# type information from the argument.
# Parameters: # value - root of the expression tree # Returns: # value - potentially new root node def generate_typed_constants(value, type_like=None): if isinstance(value, ConstantFloat): if type_like is not None: value.args.append(type_like) return value # find the first tensor argument args = [ arg for arg in value.args if isinstance(arg, Value) and not isinstance(arg, ConstantFloat) ] # 'where' is a nasty case where the first tensor is bool! if len(args) > 0: if value.op != 'where': type_like = args[0] else: type_like = args[1] for (i, arg) in enumerate(value.args): value.args[i] = generate_typed_constants(arg, type_like) return value # value_numbering(value) # # Perform value numbering. Any identical values will be merged into a single. # object. The tree rooted at the parameter becomes and acyclic graph. # Parameters: # value - root of an expression tree # Returns: potentially new root of an acyclic graph def value_numbering(value): vn = dict() def numbered_value(value): for i, _ in enumerate(value.args): value.args[i] = numbered_value(value.args[i]) key = value.vn() if key in vn: return vn[key] vn[key] = value return value return numbered_value(value) # validate_forwarding(source): # # Ensure the forwarding of source is sane and resolve any chained rules by closure. # Parameters: # source - name of operator being forwarded def validate_forwarding(source): visited = set(source) dest = registry.forwardings[source] assert source not in registry.handlers, \ source + " is both forwarded and handled" while dest not in registry.handlers: assert dest in registry.forwardings, \ source + " forwarded but no handler found" assert dest not in visited, source + " has circular forwarding" visited.add(dest) dest = registry.forwardings[dest] registry.forwardings[source] = dest ================================================ FILE: scripts/popgen/values.py ================================================ # Copyright (c) 2020 Graphcore Ltd. 
# All rights reserved
import sys
from popgen import PtrOrRef, Value, NonTensorValue
from popgen.operatorfactory import op


# AlphaValue(args)
#
# Represents the alpha computation required for operators that perform
# implicit scaling. Its purpose is to avoid a multiplication by unity.
# Parameters:
#   args[0] - value to be scaled
#   args[1] - scaling factor
class AlphaValue(Value):
    def __init__(self, args):
        operand, factor = args[0], args[1]
        # The product is kept as a third operand so that it is emitted
        # alongside the two inputs.
        Value.__init__(self, 'alpha',
                       [operand, factor,
                        op.mul(operand, factor)])

    def emit(self, values, val_id, tabs, f=sys.stdout, root=False):
        if self in values:
            return val_id

        val_id = self.emit_arguments(values, val_id, tabs, f)
        self.emit_annotations(tabs, f)

        result = "t" + str(val_id)
        values[self] = result
        factor = values[self.args[1]]
        unscaled = values[self.args[0]]
        scaled = values[self.args[2]]
        # Pick the unscaled operand when the factor is unity
        f.write(tabs + "auto *" + result + " = hasUnityValue(" + factor +
                ") ? " + unscaled + " : " + scaled + ";\n")
        return val_id + 1


# CastInPlace(op, args, to_type)
#
# Represents the operation of switching the scalar type of a tensor.
# It is a cast "in-place" in the sense that it doesn't generate
# any casting nodes.
# Parameters: # op - name of operator # args - tensor input # to_type - name of target type class CastInPlace(Value): def __init__(self, op, args, to_type): Value.__init__(self, op, args) self.to_type = to_type def emit(self, values, val_id, tabs, f=sys.stdout, root=False): if self in values: return val_id val_id = self.emit_arguments(values, val_id, tabs, f) self.emit_annotations(tabs, f) node = "t" + str(val_id) f.write(tabs + "auto *" + node + " = " + values[self.args[0]] + "->node();\n") f.write(f"{tabs}setNodeTensorAttrValue({node}, " f"getNodeTensorAttrValue({node}).to({self.to_type}));\n") f.write(f"{tabs}{node}->output()->inferTypeFrom(" f"getNodeTensorAttrValue({node}));\n") if not root: values[self] = "t" + str(val_id + 1) f.write(tabs + "auto *" + values[self] + " = " + node + "->output();\n") return val_id + 2 f.write(tabs + "return " + node + "->output();\n") return val_id + 1 # TensorType(t) # # Represents the tensor type of a tensor value. # Parameters: # t - the input tensor class TensorType(NonTensorValue): def __init__(self, t): NonTensorValue.__init__(self, 'TensorType', [t]) self.val = t def emit(self, values, val_id, tabs, f=sys.stdout, root=False): assert not root, "TensorType cannot be a root expression" if self in values: return val_id val_id = self.emit_arguments(values, val_id, tabs, f) self.emit_annotations(tabs, f) values[self] = "t" + str(val_id) f.write(tabs + "auto " + values[self] + " = " + values[self.val] + "->type()->expect();\n") return val_id + 1 # Helper(op, args, method, expects_node=False, needs_graph=False) # # A wrapper class for helper methods that return tensors # Parameters: # op - operator # args - arguments # method - generation method # expects_node - True if arguments should be typed Node* instead of Value* # needs_graph - method takes pointer to graph object class Helper(Value): def __init__(self, op, args, method, expects_node=False, needs_graph=False, const=False, ptr_or_ref=None): super().__init__(op, args, 
const, ptr_or_ref) self.method = method self.expects_node = expects_node self.needs_graph = needs_graph def emit(self, values, val_id, tabs, f=sys.stdout, root=False): if self in values: return val_id val_id = self.emit_arguments(values, val_id, tabs, f) self.emit_annotations(tabs, f) args = [values[arg] for arg in self.args] if self.expects_node: args = [arg + "->node()" for arg in args] if self.needs_graph: args = ["graph"] + args val_id = self.emit_assign_return(values, val_id, root, tabs, f, self.const, self.ptr_or_ref) self.emit_call(self.method, args, ";\n", f) return val_id # InputValue(name, num) # # Represents an input to an operator # Parameters: # name - name of input # num - index of input class InputValue(Value): def __init__(self, name, num): Value.__init__(self, 'input', []) self.name = name self.num = num def emit(self, values, val_id, tabs, f=sys.stdout, root=False): assert not root, "input values cannot be root expression" if self in values: return val_id self.emit_assign_return(values, self.name, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) f.write("node->input(" + str(self.num) + ");\n") return val_id def vn(self): return self.name def same(self, other): return True def render(self): return self.name # OutputValue(index) # # Represents the output value of an operator. This is useful for # occasions where we need the expected shape of the output. # Parameters: # index - index of output class OutputValue(Value): def __init__(self, index): Value.__init__(self, 'output' + str(index), []) self.index = index def emit(self, values, val_id, tabs, f=sys.stdout, root=False): assert not root, "output values may not be root expressions" if self in values: return val_id val_id = self.emit_assign_return(values, val_id, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) f.write("node->output(" + str(self.index) + ");\n") return val_id + 1 def render(self): return self.op # NonTensorConstant(op, val, method) # # Represents a constant value that is not a tensor. 
Supports literals # as well as graph constants. # Parameters: # val - the constant value # method - helper method to be called when the value is not a literal class NonTensorConstant(NonTensorValue): def __init__(self, op, val, method): self.val = val self.method = method if isinstance(val, Value): NonTensorValue.__init__(self, op, [val]) else: NonTensorValue.__init__(self, op, []) if isinstance(self.val, str): self.val = '"' + self.val + '"' def emit(self, values, val_id, tabs, f=sys.stdout, root=False): assert not root, op + " cannot be a root expression" if not isinstance(self.val, Value): values[self] = str(self.val) return val_id if self in values: return val_id val_id = self.emit_arguments(values, val_id, tabs, f) self.emit_annotations(tabs, f) val_id = self.emit_assign_return(values, val_id, root, tabs, f) self.emit_call(self.method, [values[self.val] + "->node()"], ";\n", f) return val_id def vn(self): if isinstance(self.val, Value): return Value.vn(self) return str(self.val) def render(self): if isinstance(self.val, Value): return self.op + "(" + self.val.render() + ")" return str(self.val) def same(self, other): if self.op != other.op or len(self.args) != len(other.args): return False if isinstance(self.val, Value): return self.val.same(other.val) return self.render() == other.render() # NonTensorHelper(op, args, method, expects_node) # # A wrapper class for helper methods that do not return tensors # Parameters: # op - operator # args - arguments # method - generation method # expects_node - True if arguments should be typed Node* instead of Value* class NonTensorHelper(NonTensorValue): def __init__(self, op, args, method, expects_node=False, needs_graph=False): NonTensorValue.__init__(self, op, args) self.method = method self.expects_node = expects_node self.needs_graph = needs_graph def emit(self, values, val_id, tabs, f=sys.stdout, root=False): assert not root, op + " helper cannot be a root expression" return Helper.emit(self, values, val_id, tabs, 
f, False) # EmptyInitializer() # # Helper class that produces an empty initializer list class EmptyInitializer(NonTensorValue): def __init__(self): NonTensorValue.__init__(self, "empty_initializer", []) def emit(self, values, val_id, tabs, f=sys.stdout, root=False): values[self] = self.render() return val_id def vn(self): return self.render() def render(self): return "{}" class OriginalNode(Value): def __init__(self): Value.__init__(self, 'input', []) self.name = "original_node" def emit(self, values, val_id, tabs, f=sys.stdout, root=False): if self in values: return val_id self.emit_assign_return(values, self.name, root, tabs, f, ptr_or_ref=PtrOrRef.PTR) f.write("node;\n") return val_id def vn(self): return self.name def same(self, other): return True def render(self): return self.name ================================================ FILE: scripts/set_version.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import argparse import logging import os from utils import _utils logger = logging.getLogger(os.path.basename(__file__)) _utils.set_logger(logger) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--debug", "-d", action="store_true", help="Print debug messages") parser.add_argument("--torch-version", type=str) parser.add_argument("--input-file", type=str) parser.add_argument("output", help="File to create") args = parser.parse_args() logging_level = logging.DEBUG if args.debug else logging.INFO logging.basicConfig(level=logging_level) logger.debug("Args: %s", str(args)) pkg_info = _utils.PkgInfo.load_from_file(must_exist=False) # Copy the content of python/__init__.py and replace the occurrences of # @VERSION@ / @SNAPSHOT@ with the actual version / snapshot with open(args.output, "w") as f: if args.input_file is None: args.input_file = os.path.join(_utils.sources_dir(), "python", "__init__.py") for line in open(args.input_file): line = line.replace("@VERSION@", pkg_info.version_long) line = line.replace("@SNAPSHOT@", pkg_info.snapshot) if args.torch_version: line = line.replace("@TORCH_VERSION@", args.torch_version) f.write(line) ================================================ FILE: scripts/utils/_utils.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import re import enum import fcntl import json import os import shutil import logging import platform import subprocess logger = logging.getLogger(__name__) def get_nprocs(): return len(os.sched_getaffinity(0)) # Make the _utils functions log using the caller's logger instead of the # default 'utils/_utils.py' def set_logger(new_logger): global logger logger = new_logger def rmdir_if_exists(directory): if os.path.isdir(directory): shutil.rmtree(directory) def rm_if_exists(filename): if os.path.isfile(filename): os.remove(filename) def get_first_line(filename): return open(filename, "r").readline().rstrip() def sources_dir(): # ./scripts/utils/../../: return os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) class OsType(enum.Enum): Osx = "osx" Linux = "linux" Unknown = "unknown" def get_required_torch_version(): for line in open(os.path.join(sources_dir(), "CMakeLists.txt"), "r"): m = re.match(r"set\(TORCH_VERSION +([0-9.]+)\)", line) if m: return m.group(1) raise RuntimeError("Couldn't find TORCH_VERSION in CMakeLists.txt") class PkgInfo: _pkg_info_file = "pkg_info.json" def __init__(self, version=None, snapshot=None, os_type=None, package_os_type=None, build_number=None, doc_name=None, project_name=None, **kwargs): logger.debug( "PkgInfo: user provided version=%s snapshot=%s os_type=%s" " package_os_type=%s build_number=%s doc_name=%s " "project_name=%s", version, snapshot, os_type, package_os_type, build_number, doc_name, project_name) self.version = version or _get_version() self.snapshot = snapshot or _get_snapshot() self.os_type = os_type or get_os_type() if isinstance(self.os_type, OsType): self.os_type = self.os_type.value self.package_os_type = package_os_type or _get_package_os_type() self.doc_name = doc_name or "poptorch-user-guide" self.poptorch_geometric_doc_name = "poptorch-geometric-user-guide" self.project_name = project_name or "poptorch" self.version_long = self.version self.poptorch_hash = _get_poptorch_hash() if 
build_number: self.version_long += "+" + build_number logger.debug("Adding custom attributes: %s", kwargs) self.__dict__.update(kwargs) logger.info("PkgInfo initialised: %s", str(self.__dict__)) def pdf_filename(self, poptorch_geometric=False): doc_name = self.poptorch_geometric_doc_name if poptorch_geometric \ else self.doc_name return f"{doc_name}-{self.version}-{self.snapshot}.pdf" def html_filename(self, poptorch_geometric=False): doc_name = self.poptorch_geometric_doc_name if poptorch_geometric \ else self.doc_name return f"{doc_name}-html-{self.version}-{self.snapshot}.zip" def prodinfo_filename(self): return f"{self.project_name}-{self.version}-{self.snapshot}.yml" def save_to_file(self): with open(PkgInfo._pkg_info_file, "w") as f: json.dump(self.__dict__, f) @staticmethod def load_from_file(must_exist=False, path="."): pkg_info_path = os.path.join(path, PkgInfo._pkg_info_file) if not os.path.exists(pkg_info_path): if not must_exist: logger.info("Using default PkgInfo() options") return PkgInfo() raise FileNotFoundError(f"{pkg_info_path} not found") logger.info("Loading packaging options from %s", pkg_info_path) with open(pkg_info_path, "r") as f: attrs = json.load(f) return PkgInfo(**attrs) def _get_version(): v = json.load(open(os.path.join(sources_dir(), "version.json"))) return f"{v['major']}.{v['minor']}.{v['point']}" def _get_view_hash(): try: hash = subprocess.check_output( [ "git", "--git-dir", os.path.join(os.path.dirname(sources_dir()), ".git"), "rev-parse", "--short=10", "HEAD" ], stderr=subprocess.STDOUT).decode("utf-8").strip().rstrip() return hash except (subprocess.CalledProcessError, FileNotFoundError): return None def _get_poptorch_hash(): try: hash = subprocess.check_output( [ "git", "--git-dir", os.path.join(sources_dir(), ".git"), "rev-parse", "--short=10", "HEAD" ], stderr=subprocess.STDOUT).decode("utf-8").strip().rstrip() return hash except (subprocess.CalledProcessError, FileNotFoundError): return None def _get_snapshot(): """ Use 
the view hash if available. Use the PopTorch hash as a fallback. Use 0000000000 if no git repository is found """ snapshot = _get_view_hash() if snapshot: logger.debug("Using View hash %s as snapshot", snapshot) return snapshot snapshot = _get_poptorch_hash() if snapshot: logger.debug("Using PopTorch hash %s as snapshot", snapshot) return snapshot logger.debug("No git hash found to use as snapshot") return "0000000000" def _get_package_os_type(): distrib = None version = None try: for line in open("/etc/os-release", "r"): if line.startswith("ID="): distrib = line.split("=")[1].rstrip() distrib = distrib.replace('"', "") elif line.startswith("VERSION_ID="): version = line.split("=")[1] version = version.replace(".", "_") version = version.replace('"', "").rstrip() except FileNotFoundError as exc: logger.warning(f"Setting distro/version to \"unknown\" because: {exc}") distrib = "unknown" version = "unknown" assert distrib and version return f"{distrib}_{version}" def get_arch_type(): arch = platform.machine() assert arch in ["aarch64", "x86_64"] return arch def get_os_type(): p = platform.uname() if p.system == "Darwin": return OsType.Osx if p.system == "Linux": return OsType.Linux return OsType.Unknown def _make_output_non_blocking(output): fd = output.fileno() fl = fcntl.fcntl(fd, fcntl.F_GETFL) fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) return output class _LinesProcessor: def __init__(self, printer_fn): self.printer_fn = printer_fn self.partial_line = "" def _is_full_line(self, line): return line[-1] == "\n" def process(self, lines, flush=False): """ Due to buffering we need to check if lines are actual lines or just fragment of lines (in which case we wait until we've got the whole line available to print it). 
""" if lines is None: lines = "" else: lines = lines.decode("utf-8") lines = lines.split("\n") lines[0] = self.partial_line + lines[0] self.partial_line = lines[-1] for line in lines[:-1]: self.printer_fn(line) if flush and self.partial_line: self.printer_fn(self.partial_line) self.partial_line = "" class Process: def __init__( self, cmd, # NB as shell=True, shlex.quote is needed for filenames env=None, redirect_stderr=False, stdout_handler=None, stderr_handler=None, bufsize=-1): if redirect_stderr: assert stderr_handler is None, ("You can't have a stderr handler " "when it's redirected to stdout") stderr = subprocess.STDOUT else: stderr = subprocess.PIPE self.p = subprocess.Popen(cmd, shell=True, env=env, executable='/bin/bash', stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=stderr, bufsize=bufsize) _make_output_non_blocking(self.p.stdout) self.stdout = _LinesProcessor(stdout_handler or logger.info) self.stderr = None self.is_alive = True self._returncode = None if not redirect_stderr: _make_output_non_blocking(self.p.stderr) self.stderr = _LinesProcessor(stderr_handler or logger.error) def _read(self): # If it's the last time _read is called (i.e is_alive is now False) # then flush the pipes and close them if self.stderr: self.stderr.process(self.p.stderr.read(), not self.is_alive) if not self.is_alive: self.p.stderr.close() self.stdout.process(self.p.stdout.read(), not self.is_alive) if not self.is_alive: self.p.stdout.close() self._returncode = self.p.returncode del self.p def eof(self): self.p.stdin.close() def is_running(self): if not self.is_alive: return self.is_alive self.is_alive = self.p.poll() is None # We need to read the outputs to avoid # the process to hang if the output gets too long self._read() return self.is_alive def wait(self): while self.is_running(): pass return self._returncode def write(self, s): self.p.stdin.write(s) def returncode(self): return self._returncode def run_commands(*commands, env=None, stop_on_error=True, 
stdout_handler=None, stderr_handler=None): bash_flags = "" if logger.isEnabledFor(logging.DEBUG): bash_flags += "x" # print commands if stop_on_error: bash_flags += "e" if bash_flags: bash_flags = "set -" + bash_flags + ";" logger.debug("Running: %s", commands) c = Process([bash_flags + ";".join(commands)], env=env, stdout_handler=stdout_handler, stderr_handler=stderr_handler) returncode = c.wait() assert returncode == 0, (f"Shell commands {commands} failed with " f"return code {returncode}") ================================================ FILE: setup.cfg ================================================ [metadata] license_files = License.txt poptorch_third_party_licenses.txt ================================================ FILE: setup.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import pathlib import os import sys import logging from setuptools import setup from setuptools.dist import Distribution from pybind11.setup_helpers import Pybind11Extension logging.basicConfig(level=logging.INFO) # torch{audio, vision} are added here to prevent the torch upgrade when other # packages depend on torch{audio, vision}. REQUIRES = [ 'tqdm', '@TORCH_DEPENDENCY@', '@TORCHAUDIO_DEPENDENCY@', '@TORCHVISION_DEPENDENCY@' ] VERSION = "@VERSION@" LONG_DESCRIPTION = ( "PopTorch is a set of extensions for PyTorch enabling " "models to be trained, evaluated and used on the Graphcore IPU.") LIBS = ["*.so", "lib/*", "lib/poplar_rt/*", "lib/graphcore/lib/*.a"] class BinaryDistribution(Distribution): """Distribution which always forces a binary package with platform name""" def has_ext_modules(self): return True def get_torch_paths(): # setup.py is executed several times, so it's ok if torch is not always # available. 
try: import torch # pylint: disable=import-outside-toplevel except ModuleNotFoundError: return [], [] torch_root = str(pathlib.Path(torch.__file__).parent) return [ os.path.join(torch_root, "include"), os.path.join(torch_root, "include", "torch", "csrc", "api", "include") ], [os.path.join(torch_root, "lib")] torch_include_dirs, torch_lib_dirs = get_torch_paths() package_data = {'poptorch': LIBS} # Copy custom codelets into the package so that we can pre-compile them later. package_data["poptorch"].append("*.inc.cpp") core_mod = Pybind11Extension( "poptorch.poptorch_core", ["src/poptorch.cpp"], define_macros=[("_GLIBCXX_USE_CXX11_ABI", 0)], include_dirs=["include"] + torch_include_dirs, library_dirs=["poptorch/lib"] + torch_lib_dirs, extra_link_args=["-Wl,--rpath=$ORIGIN/lib:$ORIGIN"], libraries=[ "poptorch", "popart_compiler", "poptorch_err", "poptorch_logging", "torch_python", "torch" ], language="c++", cxx_std="17") # Same as pybind11_add_module but without stripping the symbols and setting the visibility to hidden. # Source: https://pybind11.readthedocs.io/en/stable/compiling.html#advanced-interface-library-targets # # If the symbols are stripped then error messages will only contain symbol # addresses instead of human readable names. 
core_mod.extra_compile_args = [ f for f in core_mod.extra_compile_args if not "visibility=hidden" in f and not "-g0" in f ] setup( name='poptorch', version=VERSION, description=LONG_DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", url='http://graphcore.ai', author='Graphcore', author_email='contact@graphcore.ai', ext_modules=[core_mod], has_ext_modules=lambda: True, license='MIT License', license_files=('License.txt', 'poptorch_third_party_licenses.txt'), packages=['poptorch'], package_data=package_data, include_package_data=True, python_requires=f"=={sys.version_info.major}.{sys.version_info.minor}.*", platforms="@PLATFORM@", install_requires=REQUIRES, zip_safe=False, distclass=BinaryDistribution, classifiers=[ 'Intended Audience :: Developers', 'Intended Audience :: Education', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', ], ) ================================================ FILE: tests/.gitignore ================================================ .datasets ================================================ FILE: tests/CMakeLists.txt ================================================ add_subdirectory(custom_ops) add_subdirectory(cpp) # Copy tests to the build folder if requested. if(COPY_TESTS) # NOTE: Collapsing the hierarchy like this may cause conflicts. 
file(GLOB_RECURSE TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.py") install(FILES ${TEST_FILES} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") set(TESTS_PATH "${CMAKE_CURRENT_BINARY_DIR}") else() set(TESTS_PATH "${CMAKE_CURRENT_SOURCE_DIR}") endif() set(EXTERNAL_DATASETS_DIR "${CMAKE_BINARY_DIR}/buildenv/external_datasets") # Generate tests. run_poptorch_install_command( "python3 ${CMAKE_CURRENT_SOURCE_DIR}/generate_test_file.py \ ${TESTS_PATH} \ ${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.cmake \ --add-to-sys-path ${CMAKE_INSTALL_PREFIX} \ --external-datasets-dir ${EXTERNAL_DATASETS_DIR} \ --extra-pytest-args=\"${EXTRA_PYTEST_ARGS}\" " "${PROJECT_BINARY_DIR}" "generate_test_file.py") ================================================ FILE: tests/activations_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import pytest import torch import torch.nn as nn import helpers import poptorch # pylint: enable=wrong-import-order # Non-linear activations (Weighted activations) # 'torch.nn.ELU', 'torch.nn.Hardshrink', 'torch.nn.Hardtanh', 'torch.nn.LeakyReLU', 'torch.nn.LogSigmoid', 'torch.nn.MultiheadAttention', 'torch.nn.MultiheadAttention.forward', # 'torch.nn.PReLU', 'torch.nn.ReLU', 'torch.nn.ReLU6', 'torch.nn.RReLU', 'torch.nn.SELU', 'torch.nn.SiLU', 'torch.nn.CELU', 'torch.nn.GELU', 'torch.nn.Sigmoid', 'torch.nn.Softplus', # 'torch.nn.Softshrink', 'torch.nn.Softsign', 'torch.nn.Tanh', 'torch.nn.Tanhshrink', 'torch.nn.Threshold', # Non-linear activations (other) # 'torch.nn.Softmin', 'torch.nn.Softmax', 'torch.nn.Softmax2d', 'torch.nn.LogSoftmax', 'torch.nn.AdaptiveLogSoftmaxWithLoss', 'torch.nn.AdaptiveLogSoftmaxWithLoss.log_prob', # 'torch.nn.AdaptiveLogSoftmaxWithLoss.predict', # A version of Softplus with non default arguments class SoftplusWithParams(nn.Softplus): def __init__(self): super().__init__(beta=5.0, threshold=4.0) activation_functions = [ nn.ReLU, nn.Tanh, nn.Sigmoid, nn.SELU, 
nn.SiLU, nn.ELU, nn.Softmax, nn.LogSoftmax, nn.Softsign, nn.LeakyReLU, nn.Hardtanh, nn.Softplus, nn.Softshrink, nn.Hardshrink, nn.CELU, nn.Hardsigmoid, nn.Hardswish, SoftplusWithParams ] @pytest.mark.parametrize("op", activation_functions) def test_activations(op): torch.manual_seed(42) input = torch.randn([2, 20]) fn = op(dim=1) if op in (nn.Softmax, nn.LogSoftmax) else op() model = helpers.ModelWithWeights(fn, input.shape) model.train() # Run on CPU. native_out, _ = model((input, )) # Run on IPU. poptorch_model = poptorch.trainingModel(model) poptorch_out, _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out, rtol=1e-4, atol=1e-7, equal_nan=True) poptorch_model.assert_weights_changed() @pytest.mark.parametrize("approximate", ["tanh", "none"]) def test_gelu(approximate): if approximate == "none": pytest.skip("TODO: Implement efficient GELU_ERF") torch.manual_seed(42) input = torch.randn((2, 20)) op = nn.GELU(approximate=approximate) model = helpers.ModelWithWeights(op, input.shape) model.train() # Run on CPU. native_out, _ = model((input, )) # Run on IPU. poptorch_model = poptorch.trainingModel(model) poptorch_out, _ = poptorch_model((input, )) helpers.assert_allclose(actual=poptorch_out, expected=native_out) poptorch_model.assert_weights_changed() @pytest.mark.parametrize("input", [ torch.randn((4, )), torch.randn((2, 2)), torch.randn((2, 8, 16)), torch.randn((2, 8, 32, 32)) ]) def test_prelu(input): num_channels = input.shape[1] if input.dim() >= 2 else 1 model = nn.PReLU(num_channels) # Run on CPU. native_out = model(input) # Run on IPU. 
poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) helpers.assert_allclose(actual=poptorch_out, expected=native_out, rtol=1e-4, atol=1e-7, equal_nan=True) @pytest.mark.parametrize("dim", range(5)) def test_glu(dim): torch.manual_seed(42) N, C, M, K, L = 2, 4, 6, 8, 10 input = torch.randn(N, C, M, K, L) model = helpers.ModelWithWeights(nn.GLU(dim=dim), input.shape) # Run on CPU. native_out, _ = model((input, )) # Run on IPU. poptorch_model = poptorch.trainingModel(model) poptorch_out, _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(expected=native_out, actual=poptorch_out) # Training test - check weights have changed poptorch_model.assert_weights_changed() @pytest.mark.parametrize("op", activation_functions) def test_activation_numerics(op): enable_exceptions = True if op in (nn.SELU, nn.ELU, nn.CELU): # These activations rely on exponentials that will overflow # but saturate to a linear function in the range where x >> 0 enable_exceptions = False model = op(dim=1) if op in (nn.Softmax, nn.LogSoftmax) else op() x = torch.FloatTensor([[10., 100., 1000.]]) native_out = model(x) options = poptorch.Options() options.Precision.enableFloatingPointExceptions(enable_exceptions) poptorch_model = poptorch.inferenceModel(model, options=options) poptorch_out = poptorch_model(x) helpers.assert_allclose(actual=poptorch_out, expected=native_out) @pytest.mark.ipuHardwareRequired @pytest.mark.filterwarnings("ignore:Trace had nondeterministic nodes") @pytest.mark.filterwarnings("ignore:Output nr 1. of the traced function") @pytest.mark.filterwarnings("ignore:Output nr 2. 
of the traced function") def test_rrelu_training(): opts = poptorch.Options().randomSeed(0) input = torch.randn([3000]) model = helpers.ModelWithWeights(nn.RReLU(), input.shape) # in training negative inputs are multiplied by a random parameter # we'll check positive outputs and distribution of negative outputs native_out, _ = model((input, )) poptorch_model = poptorch.trainingModel(model, options=opts) poptorch_out, _ = poptorch_model((input, )) ref = native_out[native_out >= 0] out = poptorch_out[poptorch_out >= 0] helpers.assert_allclose(actual=out, expected=ref) ref = native_out[native_out < 0] out = poptorch_out[poptorch_out < 0] # Inference test - check outputs for stat in [torch.mean, torch.var]: helpers.assert_allclose(actual=stat(out), expected=stat(ref), atol=0.1, rtol=0.1) # Training test - check weights have changed poptorch_model.assert_weights_changed() def test_rrelu_inference(): torch.manual_seed(42) input = torch.randn([200]) model = nn.RReLU() # in inference there is no randomness - check results directly model.eval() native_out = model(input) poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) helpers.assert_allclose(actual=poptorch_out, expected=native_out) ================================================ FILE: tests/attach_detach_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import re import math import unittest.mock import pytest import torch import helpers import poptorch @unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "0"}) @pytest.mark.ipuHardwareRequired def test_attach_detach(): torch.manual_seed(42) target = torch.randint(0, 10, [1]) target = target.expand([10]) input = torch.randn(10, 10) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target=None): out = self.linear(data) if target is None: return out loss = self.loss(out, target) return out, loss model = Model() opts = poptorch.Options() # Ensure that both models use the same IPU opts.useIpuId(1) training = poptorch.trainingModel(model, options=opts) opts = opts.clone() inference = poptorch.inferenceModel(model, options=opts) _, initial_loss = training(input, target) if math.isnan(initial_loss): raise ValueError("original_loss is NaN") if poptorch.ipuHardwareIsAvailable(): with pytest.raises(poptorch.Error) as excinfo: inference.compile(torch.randn(10)) assert excinfo.match("Failed to acquire") training.detachFromDevice() # Ensure that this breaks error_msg = r"Device is not attached" with pytest.raises(poptorch.Error, match=error_msg): training.detachFromDevice() inference.compile(torch.randn(10)) if poptorch.ipuHardwareIsAvailable(): inference.detachFromDevice() assert initial_loss > 0.1 loss = float('nan') for _ in range(0, 2): _, loss = training(input, target) # Each batch should NOT report its own loss. As by default training # model should have a "Final" output mode. 
assert len(loss.size()) == 0 if math.isnan(loss): raise ValueError("loss is NaN") training.detachFromDevice() inference(torch.randn(10)) inference.detachFromDevice() @pytest.mark.ipuHardwareRequired def test_attach_detach_accuracy(): class TrainingModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.model = torch.nn.Linear(1, 2) self.loss = torch.nn.CrossEntropyLoss() def forward(self, args, loss_inputs=None): output = self.model(args) if loss_inputs is None: return output final_loss = self.loss(output, loss_inputs) return output, final_loss torch.manual_seed(42) input_data = torch.Tensor([[1.], [-1.]]) labels_data = torch.Tensor([0, 1]).long() model_with_loss = TrainingModelWithLoss() optimizer = poptorch.optim.SGD(model_with_loss.parameters(), lr=0.1, use_combined_accum=False) training_model = poptorch.trainingModel(model_with_loss, optimizer=optimizer) inference_model = poptorch.inferenceModel(model_with_loss) losses1 = [] for _ in range(5): _, loss = training_model(input_data, labels_data) print("Loss:", loss) losses1.append(loss) training_model.detachFromDevice() inference1 = inference_model(input_data) print("Predictions:", inference1) inference_model.detachFromDevice() losses2 = [] for _ in range(100): _, loss = training_model(input_data, labels_data) print(loss) losses2.append(loss) training_model.detachFromDevice() inference2 = inference_model(input_data) print("Predictions:", inference2) assert not torch.allclose(inference1, inference2) assert not torch.allclose(inference2, torch.zeros(2, 2)) assert losses1[-1] > losses2[-1] for i in range(len(losses2) - 1): assert losses2[i] != losses2[i + 1] assert losses2[-1] < 0.1 @pytest.mark.ipuHardwareRequired @unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "0"}) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_on_demand_attach(capfd): model = torch.nn.Linear(1, 2) opts = poptorch.Options() opts.connectionType(poptorch.ConnectionType.OnDemand) m 
= poptorch.inferenceModel(model, opts) input = torch.Tensor([[1.], [-1.]]) m(input) log = helpers.LogChecker(capfd).createIterator() # We acquire device 0 to compile. (It's the first device with a matching target) log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 0")) # Make sure we compile before we attach to the device. log.findNext("Finished Poplar compilation") # Device 0 is still free so we'll attach to it. log.findNext("Attached to device 0") n = poptorch.inferenceModel(model, opts) n(input) log = helpers.LogChecker(capfd).createIterator() # We acquire device 0 to compile. (It's the first device with a matching target) # Note: acquiring doesn't mean attaching, it's ok if the device is not actually free. log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 0")) # Make sure we compile before we attach to the device. log.findNext("Finished Poplar compilation") # Device 0 is in use by model 'm' so we should automatically get device 1. log.findNext("Attached to device 1") opts_always = opts.clone() opts_always.connectionType(poptorch.ConnectionType.Always) o = poptorch.inferenceModel(model, opts_always) o(input) log = helpers.LogChecker(capfd).createIterator() # In Always mode we find a free IPU before the compilation and attach to it immediately. log.findNext(re.escape("Acquired 1 IPU(s): running on device Id 2")) # Devices 0 & 1 are in use so we'll get device 2. 
log.findNext("Attached to device 2") log.findNext("Finished Poplar compilation") @pytest.mark.ipuHardwareRequired def test_attach_detach_tied_weights(): torch.manual_seed(42) input = torch.randn(10, 10) class Model(torch.nn.Module): def __init__(self, inp=10, out=100): super().__init__() self.encoder = torch.nn.Linear(inp, out, bias=False) self.encoder_tied = torch.nn.Linear(inp, out, bias=False) self.encoder_tied.weight = self.encoder.weight def forward(self, data): out = self.encoder(data) + self.encoder_tied(data) out = torch.nn.functional.linear(out, self.encoder.weight.t()) if self.training: return out, poptorch.identity_loss(out, reduction="mean") return out model = Model() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=0.001) training_opts = poptorch.Options() training_model = poptorch.trainingModel(model, options=training_opts, optimizer=optimizer) training_model.compile(input) training_model.detachFromDevice() model.eval() inference_opts = poptorch.Options() inference_model = poptorch.inferenceModel(model, options=inference_opts) inference_model.compile(input) for _ in range(5): inference_model.detachFromDevice() training_model.attachToDevice() training_model.detachFromDevice() inference_model.attachToDevice() ================================================ FILE: tests/attach_detach_wait_for_ipu_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import math import os import time import unittest.mock import pytest import torch import torch.multiprocessing as mp import helpers import poptorch def inference_process(event): assert os.environ.get('POPTORCH_WAIT_FOR_IPU') is not None torch.manual_seed(42) target = torch.randint(0, 10, [1]) target = target.expand([10]) model = torch.nn.Linear(10, 10) opts = poptorch.Options() # Ensure that both models use the same IPU opts.useIpuId(1) inference = poptorch.inferenceModel(model, options=opts) inference.compile(torch.randn(10)) event.set() time.sleep(12) inference.detachFromDevice() @helpers.printCapfdOnExit @unittest.mock.patch.dict("os.environ", {"POPTORCH_WAIT_FOR_IPU": "1"}) @pytest.mark.ipuHardwareRequired @helpers.overridePoptorchLogLevel("TRACE") def test_attach_detach_wait_for_ipu(capfd): torch.manual_seed(42) target = torch.randint(0, 10, [1]) target = target.expand([10]) input = torch.randn(10, 10) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = Model() opts = poptorch.Options() # Ensure that both models use the same IPU opts.useIpuId(1) poptorch_model = poptorch.trainingModel(model, options=opts) ctx = mp.get_context('spawn') mgr = mp.Manager() event = mgr.Event() process = ctx.Process(target=inference_process, args=(event, )) process.start() event.wait() _, initial_loss = poptorch_model(input, target) process.join() if math.isnan(initial_loss): raise ValueError("original_loss is NaN") poptorch_model.detachFromDevice() log = helpers.LogChecker(capfd) log.assert_contains("No IPU available, sleeping") ================================================ FILE: tests/batching_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import torch import pytest import helpers import poptorch def test_inferenceBatching(): torch.manual_seed(42) model = torch.nn.Linear(6, 20) # Actually batched by 100. input = torch.randn([10, 1, 5, 6]) # Run pytorch native on CPU batchsize 10. native_output = model(input) # Run on IPU batch size 1 * 10 popart batches. opts = poptorch.Options().deviceIterations(10) ipuModel = poptorch.inferenceModel(model, opts) poptorch_out = ipuModel(input) # Check that inference wrapper has defaulted to "All". assert len(poptorch_out.size()) == 4 assert poptorch_out.size()[0] == 10 helpers.assert_allclose(expected=native_output, actual=poptorch_out) def test_trainingBatching(): torch.manual_seed(4424242) # 10 Batches of 10. input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = Model() # Run on IPU batch size 1 * 10 popart batches. opts = poptorch.Options().deviceIterations(10) poptorch_model = poptorch.trainingModel(model, options=opts) # Run all 10 batches as batchsize 10. out, _ = model(input, label) # Sanity check we weren't already matching the label. assert not torch.equal(torch.argmax(out, dim=1), label) for _ in range(0, 1000): _, loss = poptorch_model(input, label) # Each batch should NOT report its own loss. As by default training model should have a "Final" output mode. assert len(loss.size()) == 0 # Run with trained weights. out, _ = model(input, label) # Check we are now equal with labels. helpers.assert_allequal(actual=torch.argmax(out, dim=1), expected=label) @pytest.mark.parametrize("mode", list(poptorch.OutputMode)) def test_inferenceOutputModes(mode): torch.manual_seed(42) model = torch.nn.Linear(6, 20) # Actually batched by 100. 
input = torch.randn([10, 1, 5, 6]) # Run pytorch native on CPU batchsize 10. native_out = model(input) # Run on IPU batch size 1 * 10 popart batches. output_return_period ignored if not EVERYN opts = poptorch.Options().deviceIterations(10) opts.outputMode(mode, output_return_period=5) ipuModel = poptorch.inferenceModel(model, opts) poptorch_out = ipuModel(input) if mode in [poptorch.OutputMode.All, poptorch.OutputMode.Default]: # Expect the full batch. assert len(poptorch_out.size()) == 4 assert poptorch_out.size()[0] == 10 helpers.assert_allclose(expected=native_out, actual=poptorch_out) elif mode == poptorch.OutputMode.EveryN: # Otherwise we are expecting device_iterations / N assert len(poptorch_out.size()) == 4 assert poptorch_out.size()[0] == 2 # Check each N is the correct batch helpers.assert_allclose(actual=poptorch_out[0], expected=native_out[4]) helpers.assert_allclose(actual=poptorch_out[1], expected=native_out[9]) else: # Otherwise we are expecting just one element per batch. assert len(poptorch_out.size()) == 4 assert poptorch_out.size()[0] == 1 if mode == poptorch.OutputMode.Final: # Check we are the same as the last output. helpers.assert_allclose(actual=poptorch_out.reshape( native_out[-1].shape), expected=native_out[-1]) elif mode == poptorch.OutputMode.Sum: # Check we are close to the sum of the batch dim. sum = torch.sum(native_out, dim=0, keepdim=True) helpers.assert_allclose(actual=poptorch_out, expected=sum) else: assert False, "Unexpected output mode %s" % mode @pytest.mark.parametrize("mode", list(poptorch.OutputMode)) def test_trainingOutputModes(mode): torch.manual_seed(42) # 1000 Batches of 10. 
input = torch.randn(1000, 10) # 1000 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([1000]) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = Model() # Run pytorch native on CPU batchsize 10. model(input, label) # Run on IPU batch size 1 * 1000 popart batches. opts = poptorch.Options().deviceIterations(1000) opts.outputMode(mode, output_return_period=20) poptorch_model = poptorch.trainingModel(model, options=opts) poptorch_out, loss = poptorch_model(input, label) if mode == poptorch.OutputMode.All: # Expect the full batch. assert len(poptorch_out.size()) == 2 assert poptorch_out.size()[0] == 1000 assert len(loss.size()) == 1 assert loss.size()[0] == 1000 # Check the rolling average loss is downward sloped. interval = 100 previous_average = torch.mean(loss[:interval]) for i in range(1, 1000 // interval): start = interval * i end = start + interval new_average = torch.mean(loss[start:end]) assert new_average < previous_average previous_average = new_average elif mode == poptorch.OutputMode.EveryN: # Otherwise we are expecting device_iterations / N assert len(poptorch_out.size()) == 2 assert poptorch_out.size()[0] == 50 # There's too much noise in the losses for us to test directly without averaging like above so just test sizes. assert len(loss.size()) == 1 assert loss.size()[0] == 50 else: # Otherwise we are expecting just one element per batch. assert len(poptorch_out.size()) == 2 assert poptorch_out.size()[0] == 1 assert len(loss.size()) == 0 if mode in [poptorch.OutputMode.Final, poptorch.OutputMode.Default]: # We just have to check the loss is small. # This is just relative to the previously observed loss values on this test with this seed. 
assert loss < 0.2 elif mode == poptorch.OutputMode.Sum: # We just have to check that the loss is huge. assert loss > 500.0 else: assert False, "Unexpected output mode %s" % mode def run_gradient_accumulation_test(input, target, gradient_accumulations, accumulation_reduction_type, lr): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.L1Loss(reduction="mean") def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = Model() opts = poptorch.Options() opts.outputMode(poptorch.OutputMode.All) opts.Training.gradientAccumulation(gradient_accumulations) if accumulation_reduction_type is not None: opts.Training.accumulationAndReplicationReductionType( accumulation_reduction_type) poptorch_model = poptorch.trainingModel(model, options=opts, optimizer=torch.optim.SGD( model.parameters(), lr=lr)) # Run 10 training steps for _ in range(10): poptorch_model(input, target) # return trained weight matrix return poptorch_model.linear.weight.data def test_gradient_accumulation_training(): torch.manual_seed(42) target = torch.randn(4, 10) input = torch.randn(4, 10) # Testing gradient accumulations 1 vs 2 and Mean reduction w_with_1 = run_gradient_accumulation_test(target, input, 1, poptorch.ReductionType.Mean, 0.01) w_with_2 = run_gradient_accumulation_test(target, input, 2, poptorch.ReductionType.Mean, 0.01) helpers.assert_allclose(actual=w_with_1, expected=w_with_2) # Test the default matches as well (i.e. 
the default is mean) w_with_2 = run_gradient_accumulation_test(target, input, 2, None, 0.01) helpers.assert_allclose(actual=w_with_1, expected=w_with_2) # Testing gradient accumulations 1 vs 2 and Sum reduction (different lr) w_with_1 = run_gradient_accumulation_test(target, input, 1, poptorch.ReductionType.Sum, 0.02) w_with_2 = run_gradient_accumulation_test(target, input, 2, poptorch.ReductionType.Sum, 0.01) helpers.assert_allclose(actual=w_with_1, expected=w_with_2) class FourBlockModel(torch.nn.Module): def __init__(self): super().__init__() self.lin1 = torch.nn.Linear(1, 1) self.lin2 = torch.nn.Linear(1, 1) self.lin3 = torch.nn.Linear(1, 1) self.lin4 = torch.nn.Linear(1, 1) def forward(self, x): with poptorch.Block("B1", ipu_id=0): out = self.lin1(x) with poptorch.Block("B2", ipu_id=1): out = self.lin2(out) with poptorch.Block("B3", ipu_id=2): out = self.lin3(out) with poptorch.Block("B4", ipu_id=3): out = self.lin4(out) return out class FourBlockModelNoScope(torch.nn.Module): def __init__(self): super().__init__() self.lin1 = torch.nn.Linear(1, 1) self.lin2 = torch.nn.Linear(1, 1) self.lin3 = torch.nn.Linear(1, 1) self.lin4 = torch.nn.Linear(1, 1) def forward(self, x): poptorch.Block.start("B1", ipu_id=0) out = self.lin1(x) poptorch.Block.start("B2", ipu_id=1) out = self.lin2(out) poptorch.Block.start("B3", ipu_id=2) out = self.lin3(out) poptorch.Block.start("B4", ipu_id=3) out = self.lin4(out) return out @pytest.mark.parametrize("num_grad_accums", (4, 5, 7)) @pytest.mark.parametrize("device_iterations", (1, 2)) def test_gradient_accumulation_pipelined_training(num_grad_accums, device_iterations): class TrainingFourBlockModel(torch.nn.Module): def __init__(self): super().__init__() self.four_block = FourBlockModel() def forward(self, x): out = self.four_block(x) with poptorch.Block("B4", ipu_id=3): loss = poptorch.identity_loss(out, reduction="mean") return out, loss model = TrainingFourBlockModel() opts = poptorch.Options() 
opts.deviceIterations(device_iterations) opts.Training.gradientAccumulation(num_grad_accums) poptorch_model = poptorch.trainingModel(model, options=opts) if num_grad_accums in (4, 5): err_msg = (r"poptorch\.Options\(\)\.Training\.gradientAccumulation " r"must be greater than or equal to the number of pipeline" r" stages \(7\) when using poptorch\.PipelinedExecution\. " r"Please note that a model with 4 pipeline stages in " r"PopTorch will have an additional 3 stages when training.") with pytest.raises(poptorch.poptorch_core.Error, match=err_msg): poptorch_model(torch.zeros(num_grad_accums * device_iterations)) else: poptorch_model(torch.zeros(num_grad_accums * device_iterations)) @pytest.mark.parametrize("pipelined", [True, False]) @pytest.mark.parametrize("Model", [FourBlockModel, FourBlockModelNoScope]) def test_gradient_accumulation_inference(pipelined, Model): model = Model() opts = poptorch.Options() if pipelined: # pylint: disable=protected-access assert isinstance(opts._execution_strategy, poptorch.PipelinedExecution) else: opts.setExecutionStrategy(poptorch.ShardedExecution()) opts.Training.gradientAccumulation(2) err_msg = (r"You must set " r"poptorch\.Options\(\)\.Training\.gradientAccumulation to 1 " r"or leave it as its default value \(1\) when running a " r"poptorch\.inferenceModel\(\)\.") if pipelined: err_msg += (r" Use poptorch\.Options\(\)\.deviceIterations() to " r"process a sufficient number of batches each run for " r"pipelined execution instead.") with pytest.raises(poptorch.poptorch_core.Error, match=err_msg): poptorch.inferenceModel(model, options=opts) @pytest.mark.parametrize("pipelined", [True, False]) @pytest.mark.parametrize("device_iterations", (2, 4)) @pytest.mark.parametrize("Model", [FourBlockModel, FourBlockModelNoScope]) def test_device_iterations_inference(pipelined, device_iterations, Model): model = Model() opts = poptorch.Options() if pipelined: # pylint: disable=protected-access assert isinstance(opts._execution_strategy, 
poptorch.PipelinedExecution) else: opts.setExecutionStrategy(poptorch.ShardedExecution()) opts.deviceIterations(device_iterations) poptorch_model = poptorch.inferenceModel(model, options=opts) if pipelined and device_iterations == 2: err_msg = (r"poptorch\.Options\(\)\.deviceIterations must be greater") with pytest.raises(poptorch.poptorch_core.Error, match=err_msg): poptorch_model(torch.zeros(device_iterations)) else: poptorch_model(torch.zeros(device_iterations)) ================================================ FILE: tests/bert_small_and_medium_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import transformers import torch import helpers import poptorch def test_bert_small(): torch.manual_seed(42) # Bert small. pretrained_weights = 'mrm8488/bert-small-finetuned-squadv2' model = transformers.BertModel.from_pretrained(pretrained_weights, torchscript=True) tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_weights) # It *just* fits on one IPU but if the sequence length is too big it will need two. input_ids = torch.tensor([tokenizer.encode("E")]) options = poptorch.Options() inference_model = poptorch.inferenceModel(model, options) poptorch_out = inference_model(input_ids) native = model(input_ids) for poptorchResult, nativeResult in zip(poptorch_out, native): helpers.assert_allclose(actual=poptorchResult, expected=nativeResult, rtol=1e-02, atol=1e-02) def test_bert_small_half(): torch.manual_seed(42) # Bert small. pretrained_weights = 'mrm8488/bert-small-finetuned-squadv2' model = transformers.BertModel.from_pretrained(pretrained_weights, torchscript=True) tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_weights) # It *just* fits on one IPU but if the sequence length is too big it will need two. 
input_ids = torch.tensor([tokenizer.encode("E")]) model.half() options = poptorch.Options() inference_model = poptorch.inferenceModel(model, options) poptorch_out = inference_model(input_ids) # Just check that we compile for now. assert poptorch_out[0].dtype == torch.half def test_bert_medium_result(): torch.manual_seed(42) pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2' # For later versions of transformers, we need to wrap the model and set # return_dict to False class WrappedModel(torch.nn.Module): def __init__(self): super().__init__() transformers_BFQA = transformers.BertForQuestionAnswering self.wrapped = transformers_BFQA.from_pretrained( 'mrm8488/bert-medium-finetuned-squadv2') def forward(self, input_ids, attention_mask): return self.wrapped.forward(input_ids, attention_mask, return_dict=False) def __getattr__(self, attr): try: return torch.nn.Module.__getattr__(self, attr) except AttributeError: return getattr(self.wrapped, attr) model = WrappedModel() tokenizer = transformers.BertTokenizer.from_pretrained( pretrained_weights, return_token_type_ids=True) context = """Edinburgh is Scotland's compact, hilly capital.""" question = "What is the capital of Scotland?" encoding = tokenizer.encode_plus(question, context) mask = encoding["attention_mask"] ins = encoding["input_ids"] input_ids = torch.tensor([ins, ins]) attention_mask = torch.tensor([mask, mask]) start_scores_native, end_scores_native = model( input_ids, attention_mask=attention_mask) opts = poptorch.Options() opts.deviceIterations(2) model.bert.embeddings.position_embeddings = poptorch.BeginBlock( model.bert.embeddings.position_embeddings, ipu_id=1) inference_model = poptorch.inferenceModel(model, opts) start_score_pop, end_scores_pop = inference_model(input_ids, attention_mask) # Longer sequences begin to accumulate more floating point error. 
helpers.assert_allclose(expected=start_scores_native, actual=start_score_pop, rtol=1e-02, atol=1e-02) helpers.assert_allclose(expected=end_scores_native, actual=end_scores_pop, rtol=1e-02, atol=1e-02) assert torch.argmax(start_score_pop), torch.argmax(start_scores_native) assert torch.argmax(end_scores_pop), torch.argmax(end_scores_native) # Convert to string (Only check the first result as we've already established the two were identical) ans_tokens = ins[torch.argmax(start_score_pop[0] ):torch.argmax(end_scores_pop[0]) + 1] answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens) answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens) assert answer_tokens_to_string == 'edinburgh' ================================================ FILE: tests/blas_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import pytest import torch import helpers import poptorch def blas_op(op, input1, input2, out, atol=1e-04, rtol=1e-04): class Model(torch.nn.Module): def __init__(self, op): super().__init__() self.op = op def forward(self, x, y, out=None): return self.op(x, y, out=out) model = Model(op) args = [input1, input2] if out is not None: args.append(out) native_out = None # Matmul fp16 is not supported on the CPU if input1.dtype != torch.half and input2.dtype != torch.half: # Run on CPU. native_out = model(*args) # Run on IPU. 
poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(*args) if native_out is not None: helpers.assert_allclose(expected=native_out, actual=poptorch_out, atol=atol, rtol=rtol, equal_nan=True) if out is not None and native_out is not None: helpers.assert_allclose(expected=native_out, actual=out, atol=atol, rtol=rtol, equal_nan=True) @pytest.mark.parametrize("out", [True, False]) @pytest.mark.parametrize("shapes", [([10, 200], [200, 45], [10, 45]), ([10, 200], [200], [10]), ([200], [200, 45], [1, 45]), ([200], [200], [])]) def test_matmul(out, shapes): torch.manual_seed(42) if len(shapes[0]) == 1 and len(shapes[1]) == 1 and out: pytest.skip( "TODO(T71439) No shape inference handler for aten::fill_.Tensor") input1 = torch.randn(shapes[0]) input2 = torch.randn(shapes[1]) out = torch.randn(shapes[2]) if out else None blas_op(torch.matmul, input1, input2, out) @pytest.mark.parametrize("mode", (poptorch.MatMulSerializationMode.InputChannels, poptorch.MatMulSerializationMode.ReducingDim, poptorch.MatMulSerializationMode.OutputChannels, poptorch.MatMulSerializationMode.Disabled)) @pytest.mark.parametrize("factor", (2, 5, 10)) @pytest.mark.parametrize("keep_precision", [True, False]) def test_serializedMatMul(mode, factor, keep_precision): torch.manual_seed(42) input1 = torch.rand(1, 10, 200) input2_dim = 45 if mode == poptorch.MatMulSerializationMode.OutputChannels: # Ensure the value is a multiple of factor input2_dim = input2_dim // factor * factor input2 = torch.rand(200, input2_dim) def serialise_matmal_op(input, other, out): assert out is None return poptorch.serializedMatMul(input, other, mode, factor, keep_precision) if keep_precision: input1 = input1.half() input2 = input2.half() blas_op(serialise_matmal_op, input1, input2, None, rtol=0.01, atol=0.05) else: blas_op(serialise_matmal_op, input1, input2, None) @pytest.mark.parametrize("optional_out", [True, False]) def test_bmm(optional_out): input1 = torch.randn([12, 10, 200]) input2 = 
torch.randn([12, 200, 33]) out = torch.randn([12, 10, 33]) if optional_out else None blas_op(torch.bmm, input1, input2, out) @pytest.mark.parametrize("bias", [True, False]) def test_matmul_training(bias): N, M, K, C = 100, 9, 7, 5 class Net(torch.nn.Module): def __init__(self): super().__init__() torch.manual_seed(42) self.linear = torch.nn.Linear(K, K, bias=bias) self.softmax = torch.nn.LogSoftmax(dim=1) self.loss = torch.nn.L1Loss(reduction="mean") def forward(self, x, y, target): x = self.linear(x) x = torch.matmul(x, y) return x, self.loss(x, target) torch.manual_seed(42) model = Net() optimizer = torch.optim.SGD(model.parameters(), lr=0.01) torch.manual_seed(42) poptorch_model = poptorch.trainingModel(model, optimizer=optimizer) x = torch.randn(N, M, K) y = torch.randn(K, K) target = torch.empty(N, M, K, dtype=torch.long).random_(0, C) for _ in range(0, 400): optimizer.zero_grad() poptorch_out, poptorch_loss = poptorch_model(x, y, target) native_out, native_loss = model(x, y, target) native_loss.backward(retain_graph=True) optimizer.step() helpers.assert_allclose(actual=poptorch_out, expected=native_out, rtol=1e-02, atol=1e-02) helpers.assert_allclose(actual=poptorch_loss, expected=native_loss, rtol=1e-03, atol=1e-03) @pytest.mark.parametrize( "params", [ # input_shape, beta, alpha ((3, 7), 1.0, 1.0), ((3, 1), 1.0, 0.75), ((1, 7), 0.75, 1.0), ((1), 0.75, 0.75), ]) def test_addmm(params): torch.manual_seed(42) input_shape, beta, alpha = params t1 = torch.randn(input_shape) t2 = torch.randn(3, 5) t3 = torch.randn(5, 7) class AddmmModel(torch.nn.Module): def __init__(self, beta, alpha): super().__init__() self.beta = beta self.alpha = alpha def forward(self, x1, x2, x3): return torch.addmm(x1, x2, x3, beta=self.beta, alpha=self.alpha) model = AddmmModel(beta, alpha) cpu_result = model(t1, t2, t3) ipu_result = poptorch.inferenceModel(model)(t1, t2, t3) helpers.assert_allclose(expected=cpu_result, actual=ipu_result) @pytest.mark.parametrize( "params", [ # 
input_shape, beta, alpha ((3, 7), 1.0, 1.0), ((3, 1), 1.0, 0.75), ((1, 7), 0.75, 1.0), ((1), 0.75, 0.75), ]) def test_baddbmm(params): torch.manual_seed(42) input_shape, beta, alpha = params t1 = torch.randn(input_shape) t2 = torch.randn(2, 3, 5) t3 = torch.randn(2, 5, 7) class AddmmModel(torch.nn.Module): def __init__(self, beta, alpha): super().__init__() self.beta = beta self.alpha = alpha def forward(self, x1, x2, x3): return torch.baddbmm(x1, x2, x3, beta=self.beta, alpha=self.alpha) model = AddmmModel(beta, alpha) cpu_result = model(t1, t2, t3) ipu_result = poptorch.inferenceModel(model)(t1, t2, t3) helpers.assert_allclose(expected=cpu_result, actual=ipu_result) @pytest.mark.parametrize("input_shape", [(20, 10)]) @pytest.mark.parametrize("beta", [0, .5]) @pytest.mark.parametrize("alpha", [0, 1.5]) @pytest.mark.parametrize("use_out", [True, False]) def test_addmv(input_shape, beta, alpha, use_out): torch.manual_seed(42) mat = torch.randn(input_shape) vec = torch.randn(input_shape[1]) inp = torch.randn(input_shape[0]) if beta == 0: # NaNs in input should be ignored inp[0] = float('nan') if alpha == 0: # NaNs in vec or mat should be ignored mat[0, 0] = float('nan') vec[0] = float('nan') output = torch.empty(input_shape[0]) if use_out else None class AddmvModel(torch.nn.Module): def __init__(self, beta, alpha): super().__init__() self.beta = beta self.alpha = alpha def forward(self, inp, mat, vec, out=None): result = torch.addmv(inp, mat, vec, beta=self.beta, alpha=self.alpha, out=out) if self.beta == 0 and self.alpha == 0: # Avoid empty compute graph result += torch.zeros_like(inp) return result model = AddmvModel(beta, alpha) cpu_result = model(inp, mat, vec, out=output) ipu_result = poptorch.inferenceModel(model)(inp, mat, vec, output) helpers.assert_allclose(expected=cpu_result, actual=ipu_result) if use_out is True: helpers.assert_allclose(expected=cpu_result, actual=output) ================================================ FILE: tests/bool_support_test.py 
================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import torch import pytest import helpers import poptorch # Not need for mean or logsumexp reduce_ops = [torch.sum, torch.prod] test_tensors = [ torch.tensor([1.0, 2.0, 3.1]), torch.tensor([1.1, 2.0, 3.0]), torch.tensor([0.0, 0.0, 0.0]) ] @pytest.mark.parametrize("op", reduce_ops) @pytest.mark.parametrize("t_1", test_tensors) @pytest.mark.parametrize("t_2", test_tensors) def test_reduce_two_bool_types(op, t_1, t_2): class Model(torch.nn.Module): def forward(self, x, y): return op(x == y) model = Model() poptorch_model = poptorch.inferenceModel(model) native_out = model(t_1, t_2) poptorch_out = poptorch_model(t_1, t_2) #expected = no dims (scalar) helpers.assert_allclose(actual=poptorch_out, expected=native_out) assert native_out.dtype == torch.int64 assert poptorch_out.dtype == torch.int32 def test_logits(): class Model(torch.nn.Module): def forward(self, logits, y): acc = torch.sum(torch.argmax(logits, -1) == y) / float(y.size(0)) return acc model = Model() logits = torch.tensor([[1.0, 2.0, 3.0], [3.0, 1.0, 2.0], [2.0, 3.0, 1.0]]) y = torch.tensor([[0], [2], [1]]) poptorch_model = poptorch.inferenceModel(model) native_out = model(logits, y) poptorch_out = poptorch_model(logits, y) helpers.assert_allclose(actual=poptorch_out, expected=native_out) ================================================ FILE: tests/buffers_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 

import torch
import pytest
import helpers
import poptorch


class ConstantBuffer(torch.nn.Module):
    """Module with an int32 buffer that forward() reads but never modifies."""
    def __init__(self):
        super().__init__()
        self.register_buffer('stuff', torch.tensor([1, 2, 3],
                                                   dtype=torch.int32))

    def forward(self, x):
        new_stuff = 1.0 + self.stuff
        return torch.sum(x + new_stuff)


def test_constant_buffer():
    """A constant buffer is usable in inference: sum([2] + [2, 3, 4]) == 15."""
    model = ConstantBuffer()

    poptorch_model = poptorch.inferenceModel(model)
    assert poptorch_model(torch.tensor([2])) == 15


def test_constant_buffer_repeat():
    """Calling the model twice gives the same result both times."""
    model = ConstantBuffer()

    poptorch_model = poptorch.inferenceModel(model)
    assert poptorch_model(torch.tensor([2])) == 15
    assert poptorch_model(torch.tensor([2])) == 15


class UpdatableBuffer(torch.nn.Module):
    """Module with two one-element buffers that are added to the input."""
    def __init__(self):
        super().__init__()
        self.register_buffer("buffer_1", torch.ones(1))
        self.register_buffer("buffer_2", torch.ones(1))

    def forward(self, x):
        return torch.sum(x + self.buffer_1 + self.buffer_2)


def test_copy_named_buffer_to_device_single_buffer():
    """Only buffers listed in updatableNamedBuffers are copied to the device.

    buffer_1 is listed, so its update changes the result (9 -> 12);
    buffer_2 is not, so its update leaves the result at 12.
    """
    model = UpdatableBuffer()

    options = poptorch.Options()
    options.updatableNamedBuffers(['buffer_1'])
    poptorch_model = poptorch.inferenceModel(model, options=options)

    x = torch.ones(3).float()
    assert poptorch_model(x) == 9

    poptorch_model.buffer_1.copy_(poptorch_model.buffer_1 + 1)
    poptorch_model.copyNamedBuffersToDevice()
    assert poptorch_model(x) == 12

    poptorch_model.buffer_2.copy_(poptorch_model.buffer_2 + 1)
    poptorch_model.copyNamedBuffersToDevice()
    assert poptorch_model(x) == 12


def test_copy_named_buffer_to_device_two_buffers():
    """With both buffers listed, each update changes the result."""
    model = UpdatableBuffer()

    options = poptorch.Options()
    options.updatableNamedBuffers(['buffer_1', 'buffer_2'])
    poptorch_model = poptorch.inferenceModel(model, options=options)

    x = torch.ones(3).float()
    assert poptorch_model(x) == 9

    poptorch_model.buffer_1.copy_(poptorch_model.buffer_1 + 1)
    poptorch_model.copyNamedBuffersToDevice()
    assert poptorch_model(x) == 12

    poptorch_model.buffer_2.copy_(poptorch_model.buffer_2 + 1)
    poptorch_model.copyNamedBuffersToDevice()
    assert poptorch_model(x) == 15


def test_copy_named_buffer_to_device_no_opt():
    """copyNamedBuffersToDevice() raises when no buffer is updatable."""
    model = UpdatableBuffer()

    options = poptorch.Options()
    poptorch_model = poptorch.inferenceModel(model, options=options)

    x = torch.ones(3).float()
    assert poptorch_model(x) == 9

    poptorch_model.buffer_1.copy_(poptorch_model.buffer_1 + 1)
    with pytest.raises(poptorch.poptorch_core.Error):
        poptorch_model.copyNamedBuffersToDevice()
    # The failed copy must not have changed the result.
    assert poptorch_model(x) == 9


def test_copy_named_buffer_to_device_invalid_opt():
    """Listing a non-existent buffer makes the first model call raise."""
    model = UpdatableBuffer()

    options = poptorch.Options()
    options.updatableNamedBuffers(['non_existing_buffer'])
    poptorch_model = poptorch.inferenceModel(model, options=options)

    x = torch.ones(3).float()
    with pytest.raises(poptorch.poptorch_core.Error):
        poptorch_model(x)


def test_training_then_inference():
    """Compile one model with buffers for training, then for inference."""
    momentum = 0.1

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm1d(10, momentum=momentum)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            y = self.bn(x)
            return y, self.loss(y, target)

    model = Model()

    input = torch.ones([4, 10], dtype=torch.float32)
    target = torch.ones([4, 10], dtype=torch.float32) + 1

    training_model = poptorch.trainingModel(model)
    training_model.compile(input, target)

    inference_model = poptorch.inferenceModel(model)
    inference_model.compile(input, target)


def test_buffer_implicit_copy():
    """BatchNorm running_mean is visible on the host after one step.

    The value is checked before and after an explicit copyWeightsToHost().
    """
    momentum = 0.1

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bn = torch.nn.BatchNorm1d(10, momentum=momentum)
            self.loss = torch.nn.MSELoss()

        def forward(self, x, target):
            y = self.bn(x)
            return y, self.loss(y, target)

    model = Model()

    input = torch.ones([4, 10], dtype=torch.float32)
    target = torch.ones([4, 10], dtype=torch.float32) + 1

    poptorch_model = poptorch.trainingModel(model)
    poptorch_model(input, target)

    helpers.assert_allclose(actual=model.bn.running_mean,
                            expected=input[0, :] * momentum)

    poptorch_model.copyWeightsToHost()

    helpers.assert_allclose(actual=model.bn.running_mean,
                            expected=input[0, :] * momentum)


def test_error_on_remove_buffer():
    """Deleting a registered buffer inside forward() raises poptorch.Error."""
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer('y', torch.tensor([2]))

        def forward(self, x):
            x = x + 1
            if 'y' in self._buffers:
                del self._buffers['y']
            return x

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    error_msg = (r"Buffer y is removed from the model when calling the "
                 r"forward method\.")
    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(torch.tensor([5.0]))


def test_error_on_redefine_buffer():
    """Reassigning a registered buffer in forward() raises poptorch.Error."""
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer('y', torch.tensor([2]))

        def forward(self, x):
            x = x + 1
            # pylint: disable=attribute-defined-outside-init
            self.y = x

    model = Model()
    poptorch_model = poptorch.inferenceModel(model)

    error_msg = (r"Buffer y is reassigned within the model when calling the "
                 r"forward method\. This is not supported\. Consider using "
                 r"self\.y\.copy_\(src\) to copy data "
                 r"from a source tensor, where src is the name of the "
                 r"source tensor\.")
    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(torch.tensor([5.0]))


class BufferUpdatingModel(torch.nn.Module):
    """Conv model that adds ``conv.bias[0]`` to a buffer on every forward."""
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(2, 2, 1, padding=0)
        self.register_buffer("test_buff", torch.zeros([2],
                                                      dtype=torch.float32))
        self.loss = torch.nn.L1Loss()

    def forward(self, inp, target):
        x = self.conv(inp)
        # The buffer update is done under no_grad, so it is excluded from
        # autograd.
        with torch.no_grad():
            # pylint: disable=attribute-defined-outside-init
            self.test_buff += self.conv.bias[0]
        return x, self.loss(x, target)


@pytest.mark.parametrize("device_iterations", [1, 3, 5])
@pytest.mark.parametrize("gradient_accumulation", [1, 3, 5])
def test_buffer_update_with_param(device_iterations, gradient_accumulation):
    """A buffer updated from a trained parameter matches native training.

    The model is first trained natively, its final bias and buffer are
    recorded, the model is reset, and the same training is repeated through
    PopTorch with device iterations and gradient accumulation.
    """
    model = BufferUpdatingModel()

    model.conv.weight.data = torch.ones_like(model.conv.weight.data)
    model.conv.bias.data = torch.ones_like(model.conv.bias.data)

    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    times_to_run = 10

    dummy_input = torch.ones([2, 2, 2, 2])
    dummy_target = torch.zeros_like(dummy_input)
    for _ in range(times_to_run * device_iterations):
        opt.zero_grad()
        for _ in range(gradient_accumulation):
            _, loss = model(dummy_input, dummy_target)
            # Match mean gradient_accumulation
            loss /= gradient_accumulation
            loss.backward()
        opt.step()

    # Reference values produced by the native run.
    model_bias = model.conv.bias.clone()
    model_test_buff = model.test_buff.clone()

    # Reset the model to its initial state before the PopTorch run.
    # pylint: disable=attribute-defined-outside-init
    model.test_buff = torch.zeros([2], dtype=torch.float32)
    model.conv.weight.data = torch.ones_like(model.conv.weight.data)
    model.conv.bias.data = torch.ones_like(model.conv.bias.data)

    # Check for proper cloning
    with pytest.raises(AssertionError):
        helpers.assert_allclose(expected=model_bias, actual=model.conv.bias)

    with pytest.raises(AssertionError):
        helpers.assert_allclose(expected=model_test_buff,
                                actual=model.test_buff)

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.Training.gradientAccumulation(gradient_accumulation)

    # One PopTorch call covers device_iterations * gradient_accumulation
    # native batches of size 2.
    dummy_input = torch.ones(
        [2 * device_iterations * gradient_accumulation, 2, 2, 2])
    dummy_target = torch.zeros_like(dummy_input)

    poptorch_model = poptorch.trainingModel(model,
                                            optimizer=torch.optim.SGD(
                                                model.parameters(), lr=0.1),
                                            options=opts)

    for _ in range(times_to_run):
        dummy_target = torch.zeros_like(dummy_input)
        poptorch_model(dummy_input, dummy_target)

    helpers.assert_allclose(expected=model_bias,
                            actual=poptorch_model.conv.bias)
    helpers.assert_allclose(expected=model_test_buff,
                            actual=poptorch_model.test_buff)


def test_failing_on_replicas():
    """A buffer-updating model with replicationFactor(2) raises an error."""
    model = BufferUpdatingModel()

    opts = poptorch.Options()
    opts.replicationFactor(2)

    poptorch_model = poptorch.trainingModel(model,
                                            optimizer=torch.optim.SGD(
                                                model.parameters(), lr=0.1),
                                            options=opts)

    dummy_input = torch.ones([4, 2, 2, 2])
    dummy_target = torch.zeros_like(dummy_input)

    error_msg = (r"PopTorch does not support broadcasting buffers. "
                 r"If your model is able to tolerate buffers becoming "
                 r"out of sync between replicas, you can disable "
                 r"buffer broadcasting using "
                 r"poptorch.Options.broadcastBuffers\(False\).")
    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model(dummy_input, dummy_target)


def test_constant_buffer_with_replicas():
    """A buffer that is never modified works with replicationFactor(2)."""
    # This should not have an error as the buffer is constant
    model = ConstantBuffer()

    opts = poptorch.Options()
    opts.replicationFactor(2)

    poptorch_model = poptorch.inferenceModel(model, opts)
    poptorch_model(torch.tensor([1, 2]))


def test_no_input_but_one_buffer():
    """A model with no inputs can update and return its own buffer."""
    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("x", torch.tensor([1.], dtype=torch.float))

        def forward(self):
            # pylint: disable=attribute-defined-outside-init,no-member
            self.x += 1.0
            return self.x

    model = Model()

    poptorch_model = poptorch.inferenceModel(model)
    # The buffer keeps its value between calls, so each call adds one more.
    assert poptorch_model() == 2.
    assert poptorch_model() == 3.
    assert poptorch_model() == 4.
    assert poptorch_model() == 5.


def test_unsynchronised_replicated_buffers():
    """With broadcastBuffers(False), replicas accumulate independently."""
    class ReplicaBufferModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buffer", torch.zeros(1, 2))

        def forward(self, x):
            buffer_update = self.buffer + x
            self.buffer.copy_(buffer_update)
            return poptorch.identity_loss(self.buffer, reduction='none')

    num_replica = 2

    torch.manual_seed(43)
    opts = poptorch.Options()
    opts.replicationFactor(num_replica)
    opts.deviceIterations(1)
    opts.broadcastBuffers(False)

    model = ReplicaBufferModel()
    model.float()
    poptorch_model = poptorch.inferenceModel(model, opts)

    x = torch.tensor([[9], [2]])
    # The buffer is updated in place with the input 50 times; per the asserts
    # below, row i of the output ends up holding 50 * x[i].
    # NOTE(review): presumably each replica receives one row of x; confirm.
    for _ in range(50):
        y = poptorch_model(x)

    assert y[0][-1] == x[0] * 50
    assert y[1][-1] == x[1] * 50


================================================
FILE: tests/conftest.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import random import enum import os.path as osp import gc import pytest import torch import helpers import numpy as np import poptorch @pytest.fixture(autouse=True) def cleanup(): # Explicitly clean up to make sure we detach from the IPU and free the graph # before the next test starts. gc.collect() # Documentation about markers: https://docs.pytest.org/en/6.2.x/example/markers.html hw_available = poptorch.ipuHardwareIsAvailable() def pytest_make_parametrize_id(val, argname): if isinstance(val, enum.Enum): return f"{argname}:{val.name}" if val is None or isinstance(val, (bool, int, str, float, torch.dtype)): return f"{argname}:{val}" if isinstance(val, type): return f"{argname}:{val.__name__}" # Use default return None def pytest_configure(config): config.addinivalue_line("markers", ("ipuHardwareRequired: require IPU hardware to be " "available on the platform")) config.addinivalue_line("markers", ("excludeFromReducedTesting: exclude from " "reduced testing runs")) config.addinivalue_line("markers", ("extendedTestingOnly: to only include " "in extended testing runs because it takes a " "long time to run")) if config.getoption("collectonly"): helpers.is_running_tests = False helpers.running_reduced_testing = config.getoption("reduced_testing") def pytest_runtest_setup(item): # Is it a test with parameters? if hasattr(item, 'callspec'): # Does it have a trace_model parameter ? trace_model = item.callspec.params.get("trace_model") if trace_model is not None: if trace_model: pytest.skip("Tracing is no longer supported: skipping.") if any(item.iter_markers("ipuHardwareRequired")): if not hw_available: pytest.skip("Hardware IPU needed to test this feature.") # Source: https://raphael.codes/blog/customizing-your-pytest-test-suite-part-2/ def pytest_collection_modifyitems(session, config, items): # pylint: disable=unused-argument # if --extended-tests is set: include all the tests with a # "extendedTestingOnly" marker (Even if --hw-tests-only is set). 
# if --hw-tests-only is set: only keep tests with a "ipuHardwareRequired" # marker. # if --no-hw-tests is set: keep only the other ones. hw_required = [] hw_not_required = [] force_include = [] force_exclude = [] include_extended = config.getoption("extended_tests") for item in items: if helpers.running_reduced_testing and any( item.iter_markers("excludeFromReducedTesting")): force_exclude.append(item) elif any(item.iter_markers("extendedTestingOnly")): if include_extended: force_include.append(item) else: force_exclude.append(item) elif any(item.iter_markers("ipuHardwareRequired")): hw_required.append(item) else: hw_not_required.append(item) if config.getoption("hw_tests_only"): config.hook.pytest_deselected(items=hw_not_required + force_exclude) items[:] = hw_required + force_include elif config.getoption("no_hw_tests"): config.hook.pytest_deselected(items=hw_required + force_exclude) items[:] = hw_not_required + force_include else: config.hook.pytest_deselected(items=force_exclude) items[:] = hw_required + hw_not_required + force_include def pytest_sessionfinish(session, exitstatus): # Exit status 5 means no tests were collected -> this is not an error. # In our case this is not an error because some files might only contain # HW tests for example and therefore won't have any test to run if # --hw-tests-only is used. 
if exitstatus == 5: session.exitstatus = 0 def pytest_addoption(parser): parser.addoption("--hw-tests-only", action="store_true", default=False, help="Only run HW tests") parser.addoption("--no-hw-tests", action="store_true", default=False, help="Exclude all tests requiring HW") parser.addoption("--extended-tests", action="store_true", default=False, help=("Include all tests marked with " "'extendedTestingOnly' (Takes precedence over" " --no-hw-tests)")) parser.addoption("--reduced-testing", action="store_true", default=False, help=("Run some tests with a reduced " "number of parameters")) parser.addoption("--seed", type=int, default=0, help=("Set the seed for running the tests.")) parser.addoption("--external-datasets-dir", type=str, default=osp.join(osp.dirname(osp.abspath(__file__)), ".datasets"), help=("The directory where the external datasets will be " "downloaded.")) @pytest.fixture(autouse=True, scope="function") def random_seed(pytestconfig): """Set the random seed for all tests in this directory. autouse=True will use this fixture in every test. Seed can be overridden with --seed on the command line to alter the seed for testing purposes. By default uses 0 for all tests. """ seed = 0 if hasattr(pytestconfig, "seed"): seed = pytestconfig.seed np.random.seed(seed) random.seed(seed) torch.manual_seed(seed) ================================================ FILE: tests/convs_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import unittest.mock import re import torch import pytest import helpers import poptorch # Convolutions. convolutions = [ torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d, torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d, torch.nn.Unfold, torch.nn.Fold, ] padding_modes = ['zeros', 'reflect', 'replicate', 'circular'] # Unsupported folds = [] # torch.nn.Unfold, torch.nn.Fold, # Supported. 
conv_1D = [torch.nn.Conv1d, torch.nn.ConvTranspose1d]
conv_2D = [torch.nn.Conv2d, torch.nn.ConvTranspose2d]
conv_3D = [torch.nn.Conv3d, torch.nn.ConvTranspose3d]


def execute_and_check_wrapper(op, input, training=True, rtol=0.01, atol=0.01):
    """Run `op` natively and through PopTorch and compare the outputs.

    `op` is wrapped in helpers.ModelWithWeights with an L1 loss. When
    `training` is true, five steps are run on the IPU, the weights are put
    back to their initial values, five native steps are run, and the last
    outputs are compared. Otherwise a single inference is compared.

    The test is skipped when `op` has a `padding_mode` other than 'zeros'.
    """
    if hasattr(op, 'padding_mode') and op.padding_mode != 'zeros':
        pytest.skip("TODO(T25617): PopART does not support PadGradOp when"
                    " mode is not 'constant'")

    model = helpers.ModelWithWeights(op,
                                     input.shape,
                                     loss_fn=torch.nn.L1Loss(reduction='mean'),
                                     out_fn=lambda x: (x, torch.zeros_like(x)))

    if training:
        optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01)
        poptorch_model = poptorch.trainingModel(model, optimizer=optimizer)

        # `op` may be a plain callable without named_parameters(), in which
        # case the attribute lookup raises AttributeError.
        try:
            has_own_weight = any(
                n == 'weight'
                for (n, p) in poptorch_model.op.named_parameters())
        except AttributeError:
            has_own_weight = False

        if has_own_weight:
            weights_before = poptorch_model.op.weight.detach().clone()

        input = torch.ones_like(input)

        for _ in range(5):
            poptorch_out, loss = poptorch_model((input, ))

        # Restore the starting weights so the native run begins from the
        # same state as the IPU run did.
        if has_own_weight:
            model.op.weight.data = weights_before
        # pylint: disable=protected-access
        model.lin.weight.data = model._weights_before

        for _ in range(5):
            optimizer.zero_grad()
            native_out, loss = model((input, ))
            loss.backward()
            optimizer.step()

        # Inference test - check outputs
        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                rtol=rtol,
                                atol=atol)
    else:
        poptorch_model = poptorch.inferenceModel(model)

        # Run on CPU.
        native_out, _ = model((input, ))

        # Run on IPU.
        poptorch_out, _ = poptorch_model((input, ))

        helpers.assert_allclose(actual=poptorch_out,
                                expected=native_out,
                                rtol=rtol,
                                atol=atol)


@pytest.mark.parametrize("op", conv_1D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv1D(op, padding_mode, training):
    """Check 1D (transposed) convolutions for each padding mode."""
    # This combination doesn't exist in upstream Torch:
    # ValueError: Only "zeros" padding mode is supported for ConvTranspose1d
    if (op is torch.nn.ConvTranspose1d and padding_mode != 'zeros'):
        return

    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 8
    input = torch.randn(1, C_IN, 10)

    # With square kernels and equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training)

    if op is torch.nn.ConvTranspose1d:
        # 'popart_exception': Non default value for dilations is not supported.
        return

    # non-square kernels and unequal stride and with padding and dilation
    model = op(C_IN,
               C_OUT, (3),
               stride=(2),
               padding=(4),
               dilation=(3),
               padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training)


@pytest.mark.parametrize("op", conv_2D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv2D(op, padding_mode, training):
    """Check 2D (transposed) convolutions, including grouped variants."""
    if (op is torch.nn.ConvTranspose2d and padding_mode != 'zeros') or \
       padding_mode == 'circular':  # TODO(T31811)
        pytest.skip('skipping unsupported padding_mode')

    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 2
    input = torch.randn(1, C_IN, 8, 10)

    # With square kernels and equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Grouped convolutions.
    model = op(C_IN,
               C_OUT, (3, 5),
               stride=2,
               groups=2,
               padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Rectangular padding/stride
    if op is not torch.nn.ConvTranspose2d:
        # non-square kernels and unequal stride and with padding
        model = op(C_IN, C_OUT, (3, 5), stride=(2, 1), padding=(4, 2))
        execute_and_check_wrapper(model, input, training=False)

        # non-square kernels and unequal stride and with padding and dilation
        model = op(C_IN,
                   C_OUT, (3, 5),
                   stride=(2, 1),
                   padding=(4, 2),
                   dilation=(3),
                   padding_mode=padding_mode)
        execute_and_check_wrapper(model,
                                  input,
                                  training,
                                  rtol=0.01,
                                  atol=0.05)


@pytest.mark.parametrize("op", conv_3D)
@pytest.mark.parametrize("padding_mode", padding_modes)
@pytest.mark.parametrize("training", [True, False])
def test_conv3D(op, padding_mode, training):
    """Check 3D (transposed) convolutions, including grouped variants."""
    if (op is torch.nn.ConvTranspose3d and padding_mode != 'zeros') or \
       (op is torch.nn.Conv3d and padding_mode == 'reflect') or \
       padding_mode == 'circular':  # TODO(T31811)
        pytest.skip('skipping unsupported padding_mode')

    torch.manual_seed(42)
    C_IN = 4
    C_OUT = 2
    input = torch.randn(1, C_IN, 3, 5, 8)

    # With square kernels and equal stride
    model = op(C_IN, C_OUT, 3, stride=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    # Grouped convolutions.
    model = op(C_IN, C_OUT, 3, stride=2, groups=2, padding_mode=padding_mode)
    execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

    if op is torch.nn.ConvTranspose3d:
        # test output padding
        model = op(C_IN,
                   C_OUT, (3, 2, 2),
                   stride=(2, 1, 1),
                   groups=2,
                   output_padding=[1, 0, 0],
                   padding_mode=padding_mode)
        execute_and_check_wrapper(model,
                                  input,
                                  training,
                                  rtol=0.05,
                                  atol=0.05)
    else:
        # non-square kernels and unequal stride and with padding
        model = op(C_IN,
                   C_OUT, (3, 2, 2),
                   stride=(2, 1, 1),
                   padding=(4, 2, 0),
                   padding_mode=padding_mode)
        execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)

        # non-square kernels and unequal stride and with padding and dilation
        model = op(C_IN,
                   C_OUT, (3, 4, 2),
                   stride=(2, 1, 1),
                   padding=(4, 2, 0),
                   dilation=(3, 1, 1))
        execute_and_check_wrapper(model, input, training, rtol=0.1, atol=0.1)


def merge_dicts(x, y):
    """Return a new dict with the entries of `x` updated by those of `y`."""
    z = x.copy()
    z.update(y)
    return z


# The test is reliant on an IPU model with limited memory, so force the small
# model
@pytest.mark.skip(reason="TODO(AFS-343)")
@helpers.printCapfdOnExit
@unittest.mock.patch.dict(
    "os.environ",
    merge_dicts(helpers.forceSmallModel(), {"POPLIBS_LOG_LEVEL": "DEBUG"}))
def test_available_memory(capfd):
    """Check set_available_memory changes the proportion seen in the log."""
    seen_length = 0

    def get_mem_prop_for_conv(op):
        # Compile `op` and return the availableMemoryProportion value found
        # in the captured log output.
        nonlocal seen_length

        torch.manual_seed(42)
        input = torch.randn(1, 4, 10, 10)

        model = helpers.ModelWithWeights(
            op,
            input.shape,
            loss_fn=torch.nn.L1Loss(reduction='mean'),
            out_fn=lambda x: (x, torch.zeros_like(x)))

        poptorch.inferenceModel(model).compile((input, ))

        _, log = capfd.readouterr()
        m = re.search(r"availableMemoryProportion\ +([\d.]+)", log)
        assert m
        return float(m.group(1))

    model = torch.nn.Conv2d(4, 16, 10, stride=1)
    default_prop_for_conv = get_mem_prop_for_conv(model)

    model.register_forward_hook(lambda _1, _2, conv: poptorch.
                                set_available_memory(conv, 0.5))
    adjusted_prop_for_conv = get_mem_prop_for_conv(model)

    # The default value for available_memory should be more than 0.5 meaning
    # the default memory available for the convolution should be more than
    # after we adjusted the available memory
    assert default_prop_for_conv > adjusted_prop_for_conv
    assert adjusted_prop_for_conv == 0.5


@pytest.mark.parametrize("mode", poptorch.MatMulSerializationMode)
def test_matmul_serialization(mode):
    """Check serializedMatMul matches the native result for every mode."""
    torch.manual_seed(42)

    input_channels = 6
    reducing_dim = 2
    output_channels = 4
    lhs = torch.randn(input_channels, reducing_dim)
    rhs = torch.randn(reducing_dim, output_channels)
    # Each factor below divides the size of the dimension serialised by the
    # corresponding mode (6, 2 and 4 respectively).
    if mode == poptorch.MatMulSerializationMode.Disabled:
        factor = 0
    elif mode == poptorch.MatMulSerializationMode.InputChannels:
        factor = 2
    elif mode == poptorch.MatMulSerializationMode.ReducingDim:
        factor = 2
    elif mode == poptorch.MatMulSerializationMode.OutputChannels:
        factor = 4
    else:
        assert False, "Invalid mode"

    class BasicNetwork(torch.nn.Module):
        def forward(self, x, y):
            out = poptorch.serializedMatMul(x,
                                            y,
                                            mode,
                                            factor,
                                            keep_precision=True)
            return out

    # Just check we don't explode when the value is set.
    model = BasicNetwork()
    native_out = model(lhs, rhs)
    poptorch_model = poptorch.inferenceModel(model)
    poptorch_out = poptorch_model(lhs, rhs)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


def test_available_memory_automatic():
    """Check per-IPU setAvailableMemoryProportion leaves results unchanged."""
    torch.manual_seed(42)

    # Just check we don't explode when the value is set.
    class Network(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1, 10, 5),
                                              torch.nn.MaxPool2d(2),
                                              torch.nn.ReLU())
            self.layer2 = torch.nn.Sequential(torch.nn.Conv2d(10, 20, 5),
                                              torch.nn.MaxPool2d(2),
                                              torch.nn.ReLU())
            self.layer3 = torch.nn.Linear(320, 256)
            self.layer3_act = torch.nn.ReLU()
            self.layer4 = torch.nn.Linear(256, 10)

            self.softmax = torch.nn.LogSoftmax(1)

        def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            # Flatten the conv output for the linear layers.
            x = x.view(-1, 320)

            x = self.layer3_act(self.layer3(x))
            x = self.layer4(x)
            x = self.softmax(x)
            return x

    model = Network()
    # Run on CPU.
    input = torch.randn(2, 1, 28, 28)
    native_out = model(input)

    # Run on IPU.
    opts = poptorch.Options()
    opts.setAvailableMemoryProportion(available_memory_proportion={
        "IPU0": 0.7,
        "IPU1": 0.2
    })

    poptorch_model = poptorch.inferenceModel(model, opts)
    poptorch_out = poptorch_model(input)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("training", [True, False])
def test_cumsum(dim, training):
    """Check torch.cumsum along every positive and negative dim."""
    torch.manual_seed(42)

    op = lambda x: torch.cumsum(x, dim=dim)
    input = torch.randn(1, 5, 6, dtype=torch.float32)

    execute_and_check_wrapper(op, input, training, rtol=0.02, atol=0.02)


@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("training", [True, False])
def test_cumprod(dim, training):
    """Check torch.cumprod along every positive and negative dim."""
    torch.manual_seed(42)

    input = torch.randn(1, 5, 6, dtype=torch.float32)
    op = lambda x: torch.cumprod(x, dim=dim)

    execute_and_check_wrapper(op, input, training, rtol=0.02, atol=0.02)


@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_changing_types(src_dtype, dest_dtype, dim):
    """Check the method form of cumsum with an explicit output dtype."""
    class Model(torch.nn.Module):
        def forward(self, inp):
            return inp.cumsum(dim=dim, dtype=dest_dtype)

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model)

    torch.manual_seed(42)
    inp = torch.randn(1, 5, 6).to(src_dtype)

    helpers.assert_allclose(actual=ipu_model(inp), expected=cpu_model(inp))


# The free-function, `out=` form of `cumsum` works a bit differently to the
# method form.
@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_changing_types_out(src_dtype, dest_dtype, dim):
    """Check torch.cumsum writing into an out= tensor of another dtype."""
    class Model(torch.nn.Module):
        def forward(self, inp):
            res = torch.empty(inp.shape).to(dest_dtype)
            return torch.cumsum(inp, dim=dim, out=res)

    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(cpu_model)

    torch.manual_seed(42)
    inp = torch.randn(1, 5, 6).to(src_dtype)

    helpers.assert_allclose(actual=ipu_model(inp), expected=cpu_model(inp))


# Test that the result of `cumsum` can be passed forward without loss of tensor
# shape metadata.
@pytest.mark.parametrize("src_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dest_dtype", [torch.float, torch.int])
@pytest.mark.parametrize("dim", range(-1, 1))
def test_cumsum_can_pass_on(src_dtype, dest_dtype, dim):
    """Check the result of cumsum can be indexed without raising."""
    class Model(torch.nn.Module):
        def forward(self, inp):
            return inp.cumsum(dim=dim, dtype=dest_dtype)[:, -1]

    ipu_model = poptorch.inferenceModel(Model())

    torch.manual_seed(42)
    inp = torch.randn(1, 5, 6).to(src_dtype)

    # Just test it doesn't fail
    try:
        ipu_model(inp)
    except poptorch.poptorch_core.Error as _:
        assert False, "Passing the result of torch.cumsum onwards failed."


================================================
FILE: tests/cpp/CMakeLists.txt
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.70 REQUIRED COMPONENTS unit_test_framework)

# Ensure ABI matches that of PyTorch
add_definitions(${TORCH_CXX_FLAGS})

# add_unit_test(<name> <sources>...)
# Builds an executable called <name> from the remaining arguments, links it
# against Boost.Test, torch and the poptorch libraries, adds the poptorch
# source include directory, and registers the executable as a test of the
# same name.
function(add_unit_test name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} Boost::unit_test_framework torch poptorch poptorch_logging pthread)
  target_include_directories(${name} PRIVATE ${CMAKE_SOURCE_DIR}/poptorch/source/include/)
  add_test(${name} ${name})
endfunction()

add_unit_test(GNNOptimizationsTest GNNOptimizationsTest.cpp)

================================================
FILE: tests/cpp/GNNOptimizationsTest.cpp
================================================
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#define BOOST_TEST_MODULE GNNOptimizationsTest

// NOTE(review): the targets of the next three includes (and the template
// argument of the std::make_shared calls further down) are missing in this
// copy of the file; restore them from the repository before building.
#include
#include
#include

#include "poptorch/OpBuilder.hpp"
#include "poptorch/PopartCanonicalization.hpp"
#include "poptorch/TypeAndConstantCanonicalization.hpp"

// Checks that the number of ":<op>" substrings in graph_str equals the value
// of the local constant that is itself named <op>. The ':' prefix means that,
// for example, "gather" is not counted inside "groupedgather" and "squeeze"
// is not counted inside "unsqueeze".
#define CHECK_OPS_IN_GRAPH(graph_str, op)                                      \
  BOOST_CHECK_EQUAL(occurrences(graph_str, std::string(":").append(#op)), op);

// Returns the number of non-overlapping occurrences of phrase in graph.
// phrase must be non-empty: with an empty phrase the search position never
// advances and the loop does not terminate.
int occurrences(const std::string &graph, const std::string &phrase) {
  int occurrs = 0;
  std::string::size_type position = 0;
  while ((position = graph.find(phrase, position)) != std::string::npos) {
    occurrs++;
    // Resume the search just past the match that was found.
    position += phrase.length();
  }
  return occurrs;
}

// Prints every node of graph into a single string and returns it.
std::string parseGraphToStr(torch::jit::Graph *graph) {
  std::stringstream output_ir_stream;
  for (auto *node : graph->nodes()) {
    node->print(output_ir_stream, 0, nullptr, true, false, false, false);
  }
  return output_ir_stream.str();
}

// Checks that the node producing the first graph output prints with exactly
// one occurrence of "squeeze".
void checkIsReturnUpdated(torch::jit::Graph *graph) {
  torch::jit::Node *output = graph->outputs()[0]->node();
  std::stringstream output_ir_stream;
  output->print(output_ir_stream, 0, nullptr, true, false, false, false);
  // Return from scatterreduce should be replaced by squeeze from grouped
  // version.
  BOOST_CHECK_EQUAL(occurrences(output_ir_stream.str(), "squeeze"), 1);
}

// Three scatterreduce and three gather nodes on the same axis, where the
// third node of each kind consumes the outputs of the first two
// scatterreduce nodes. The expected counts require exactly one grouped node
// and one ungrouped node of each kind after the pass (presumably the
// dependent third node is the one left ungrouped -- confirm against the pass).
BOOST_AUTO_TEST_CASE(GroupScatterReduceAndGatherNodes0) {
  auto graph = std::make_shared();
  const std::string input = R"IR(
    graph():
      %1 : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %2 : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %3 : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %4 : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %5 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%1, %2)
      %6 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%3, %4)
      %7 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%5, %6)
      %8 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%1, %2)
      %9 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%3, %4)
      %10 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%5, %6)
      return(%6)
  )IR";

  parseIR(input, graph.get());
  poptorch::groupScatterReduceAndGatherNodes(graph.get());

  // Expected number of nodes of each kind after the pass. The constant names
  // double as the op names searched for by CHECK_OPS_IN_GRAPH.
  constexpr std::size_t tensor_constant = 4;
  constexpr std::size_t unsqueeze = 8;
  constexpr std::size_t concat = 4;
  constexpr std::size_t groupedscatterreduce = 1;
  constexpr std::size_t groupedgather = 1;
  constexpr std::size_t scatterreduce = 1;
  constexpr std::size_t gather = 1;
  constexpr std::size_t slice = 4;
  constexpr std::size_t squeeze = 4;

  std::string output_ir = parseGraphToStr(graph.get());

  CHECK_OPS_IN_GRAPH(output_ir, tensor_constant);
  CHECK_OPS_IN_GRAPH(output_ir, unsqueeze);
  CHECK_OPS_IN_GRAPH(output_ir, concat);
  CHECK_OPS_IN_GRAPH(output_ir, groupedscatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, scatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, groupedgather);
  CHECK_OPS_IN_GRAPH(output_ir, gather);
  CHECK_OPS_IN_GRAPH(output_ir, slice);
  CHECK_OPS_IN_GRAPH(output_ir, squeeze);
  checkIsReturnUpdated(graph.get());
}

// Three independent scatterreduce and three independent gather nodes, where
// the third node of each kind uses axis=1 instead of axis=0. The expected
// counts again require exactly one grouped node and one ungrouped node of
// each kind after the pass (presumably the axis=1 node is the one left
// ungrouped -- confirm against the pass).
BOOST_AUTO_TEST_CASE(GroupScatterReduceAndGatherNodes1) {
  auto graph = std::make_shared();
  const std::string input = R"IR(
    graph():
      %1 : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %2 : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %3 : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %4 : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %5 : Float(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %6 : Int(requires_grad=0, device=cpu) = poptorch::tensor_constant()
      %7 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%1, %2)
      %8 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=0, reduction=0, enable_index_broadcast=1](%3, %4)
      %9 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::scatterreduce[axis_size=0, axis=1, reduction=0, enable_index_broadcast=1](%5, %6)
      %10 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%1, %2)
      %11 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=0](%3, %4)
      %12 : Float(2, strides=[1], requires_grad=0, device=cpu) = popart::gather[axis=1](%5, %6)
      return(%8)
  )IR";

  parseIR(input, graph.get());
  poptorch::groupScatterReduceAndGatherNodes(graph.get());

  // Expected number of nodes of each kind after the pass. The constant names
  // double as the op names searched for by CHECK_OPS_IN_GRAPH.
  constexpr std::size_t tensor_constant = 6;
  constexpr std::size_t unsqueeze = 8;
  constexpr std::size_t concat = 4;
  constexpr std::size_t groupedscatterreduce = 1;
  constexpr std::size_t groupedgather = 1;
  constexpr std::size_t scatterreduce = 1;
  constexpr std::size_t gather = 1;
  constexpr std::size_t slice = 4;
  constexpr std::size_t squeeze = 4;

  std::string output_ir = parseGraphToStr(graph.get());

  CHECK_OPS_IN_GRAPH(output_ir, tensor_constant);
  CHECK_OPS_IN_GRAPH(output_ir, unsqueeze);
  CHECK_OPS_IN_GRAPH(output_ir, concat);
  CHECK_OPS_IN_GRAPH(output_ir, groupedscatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, scatterreduce);
  CHECK_OPS_IN_GRAPH(output_ir, groupedgather);
  CHECK_OPS_IN_GRAPH(output_ir, gather);
  CHECK_OPS_IN_GRAPH(output_ir, slice);
  CHECK_OPS_IN_GRAPH(output_ir, squeeze);
  checkIsReturnUpdated(graph.get());
}

================================================
FILE: tests/cpu_op_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import helpers
import poptorch


def test_simple_CPU():
    """Run a two-input multiply inside a poptorch.CPU op.

    The model scales the CPU op result by 3, so the output is compared with
    ``in1 * 6`` when the second input is 2 and ``in1 * 12`` when it is 4.
    The compiled model is called twice to check both values.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Wrap the bound method so it is executed through poptorch.CPU.
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x, y):
            return x * y

        def forward(self, x, y):
            w = self.cpu(x, y)
            return w * 3.0

    model = Model()

    inference_model = poptorch.inferenceModel(model)

    in1 = torch.randn([5, 2, 3, 5])
    in2 = torch.tensor([2.0])

    out = inference_model(in1, in2)
    helpers.assert_allclose(actual=out, expected=in1 * 6.0, equal_nan=True)

    # Second call with a different scalar input.
    in2 = torch.tensor([4.0])

    out = inference_model(in1, in2)
    helpers.assert_allclose(actual=out, expected=in1 * 12.0, equal_nan=True)


def test_simple_CPU_multiple_outputs():
    """Run a poptorch.CPU op that returns two tensors (product and sum).

    The product is scaled by 3 in the model and the sum is returned
    unchanged; both outputs are checked for two values of the second input.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.cpu = poptorch.CPU(self.foo, "MyCPUOp")

        def foo(self, x, y):
            return x * y, x + y

        def forward(self, x, y):
            w, z = self.cpu(x, y)
            return w * 3.0, z

    model = Model()

    inference_model = poptorch.inferenceModel(model)

    in1 = torch.randn([5, 2, 3, 5])
    in2 = torch.tensor([2.0])

    out, out2 = inference_model(in1, in2)
    helpers.assert_allclose(actual=out, expected=in1 * 6.0, equal_nan=True)
    helpers.assert_allclose(actual=out2, expected=(in1 + in2), equal_nan=True)

    # Second call with a different scalar input.
    in2 = torch.tensor([4.0])

    out, out2 = inference_model(in1, in2)
    helpers.assert_allclose(actual=out, expected=in1 * 12.0, equal_nan=True)
    helpers.assert_allclose(actual=out2, expected=(in1 + in2), equal_nan=True)

def test_CPU_reduce(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.cpu = poptorch.CPU(self.foo, "MyCPUOp") def foo(self, x): return torch.mean(x) def forward(self, x): w = self.cpu(x) return w model = Model() inference_model = poptorch.inferenceModel(model) in1 = torch.randn([5, 2, 3, 5]) out = inference_model(in1) helpers.assert_allclose(actual=out, expected=torch.mean(in1), equal_nan=True) def test_CPU_matmul(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.matmul = [torch.nn.Linear(20, 30)] self.cpu = poptorch.CPU(self.matmul[0], "MatMulOnCPU") def forward(self, input): return self.cpu(input) model = Model() inference_model = poptorch.inferenceModel(model) input = torch.randn(128, 20) out = inference_model(input) helpers.assert_allclose(actual=out, expected=model.matmul[0](input), equal_nan=True) def test_CPU_multiple_calls(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.cpu = poptorch.CPU(self.foo, "MyCPUOp") def foo(self, x): assert x.device.type == "cpu", x.device.type return x * 2.0 def forward(self, x): out = self.cpu(x) out = self.cpu(out) out = self.cpu(out) return out model = Model() inference_model = poptorch.inferenceModel(model) in1 = torch.randn([5, 2, 3, 5]) out = inference_model(in1) helpers.assert_allclose(actual=out, expected=in1 * 8.0, equal_nan=True) def test_CPU_multiple_calls_multiple_classes(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.cpu = poptorch.CPU(self.foo, "MyCPUOp") self.cpu2 = poptorch.CPU(self.bar, "MyCPUOp2") def foo(self, x): return x * 2.0 def bar(self, x, y): return x + y def forward(self, x, y): out = self.cpu(x) out = self.cpu2(out, y) out = self.cpu(out) out = self.cpu2(out, y) out = self.cpu(out) out = self.cpu2(out, y) return out model = Model() inference_model = poptorch.inferenceModel(model) in1 = torch.randn([5]) 
in2 = torch.randn([5]) out = inference_model(in1, in2) helpers.assert_allclose(actual=out, expected=model(in1, in2), equal_nan=True) # Just test that the dispatcher is disabled in the CPU op, and re-enabled # afterwards. def test_poptorch_op_in_cpu_op(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.cpu = poptorch.CPU(self.foo, "MyCPUOp") def foo(self, x): return poptorch.identity_loss(x, reduction='sum') def forward(self, x): w = self.cpu(x) return w, self.foo(x) options = poptorch.Options() options.deviceIterations(2) dispatched_model = poptorch.inferenceModel(Model(), options) # Just check it doesn't crash dispatched_model(torch.tensor([1.0, 2.0])) ================================================ FILE: tests/ctc_decoder_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import torch import poptorch class SimpleModel(torch.nn.Module): def forward(self, log_probs, lengths): return poptorch.ctc_beam_search_decoder(log_probs, lengths) def test_ctc_decoder(): input_size = 9 batch_size = 3 num_classes = 10 torch.manual_seed(42) log_probs = torch.randn(input_size, batch_size, num_classes) lengths = torch.randint(5, input_size, (batch_size, ), dtype=torch.int) model = SimpleModel() poptorch_model = poptorch.inferenceModel(model) result = poptorch_model(log_probs, lengths) # note we have no reference implementation so only the most basic # test is possible - relying on popart/poplibs which are validated # against tensorflow's implementation assert result[0].shape == (batch_size, 1) assert result[1].shape == (batch_size, 1) assert result[2].shape == (batch_size, 1, input_size) ================================================ FILE: tests/custom_loss_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import torch import torch.optim as optim import pytest import helpers import poptorch # Test the reductions work as expected @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) def test_non_final_loss_reductions(reduction): torch.manual_seed(42) base_model = torch.nn.Linear(10, 10) class CustomLoss(torch.nn.Module): # Mean squared error scaled. def forward(self, x, target): partial_loss = poptorch.identity_loss(x - target, reduction=reduction) loss = partial_loss * partial_loss * 5 return partial_loss, poptorch.identity_loss(loss, reduction="mean") loss_fn = CustomLoss() class ModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.base_model = base_model def forward(self, data, target): out = base_model(data) loss = loss_fn(out, target) return out, loss model = ModelWithLoss() poptorch_model = poptorch.trainingModel(model) target = torch.randn(10) input = torch.randn(10) # Capture what the loss function will see before the loss changes. x, _ = model(input, target) _, (partial_loss, _) = poptorch_model(input, target) # Check we have actually reduced the loss if reduction != "none": assert torch.numel(partial_loss) == 1 if reduction == "mean": simulated_loss = torch.mean(x - target) elif reduction == "sum": simulated_loss = torch.sum(x - target) elif reduction == "none": simulated_loss = x - target helpers.assert_allclose(expected=simulated_loss.reshape_as(partial_loss), actual=partial_loss, rtol=1e-02, atol=1e-02) # Test custom loss by training to targets def run_custom_loss_test(loss_fn, base_model=None, input=None, target=None, test_output_vs_target=True): torch.manual_seed(42) if base_model is None: base_model = torch.nn.Linear(10, 10) if input is None: input = torch.randn(1, 10) if target is None: target = torch.randint(0, 10, [1]) class ModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.base_model = base_model self.loss_fn = loss_fn def forward(self, data, target): out = base_model(data) loss = 
self.loss_fn(out, target) return out, loss model = ModelWithLoss() poptorch_model = poptorch.trainingModel(model) # Pytorch native. native_out, loss = model(input, target) #Make sure the first run doesn't already pass the test. original, original_loss = poptorch_model(input, target) assert original_loss > 0.1 if test_output_vs_target: assert not torch.allclose(native_out, target, rtol=1e-02, atol=1e-02) assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02) for _ in range(0, 2500): out, loss = poptorch_model(input, target) # Check we have trained the "model" assert loss < 0.1 if test_output_vs_target: helpers.assert_allclose(actual=out, expected=target, rtol=1e-02, atol=1e-02) # Check that the pytorch native model is also returning the trained # value that was trained on IPU. out, _ = model(input, target) helpers.assert_allclose(actual=out, expected=target, rtol=1e-02, atol=1e-02) return poptorch_model def test_custom_loss(): torch.manual_seed(42) class CustomLoss(torch.nn.Module): # Mean squared error scaled. def forward(self, x, target): loss = poptorch.identity_loss(x - target, reduction="none") loss = loss * loss * 5.0 return poptorch.identity_loss(loss, reduction="mean") run_custom_loss_test(loss_fn=CustomLoss(), input=torch.randn(10), target=torch.randn(10)) def test_custom_loss_l1(): torch.manual_seed(42) class CustomLoss(torch.nn.Module): # Mean squared error scaled. def forward(self, x, target): loss = torch.nn.functional.l1_loss(x, target) loss = loss * loss * 5.0 return poptorch.identity_loss(loss, reduction="mean") run_custom_loss_test(loss_fn=CustomLoss(), input=torch.randn(10), target=torch.randn(10)) def test_custom_loss_nll(): torch.manual_seed(42) class CustomLoss(torch.nn.Module): # Mean squared error scaled. 
def forward(self, x, target): loss = torch.nn.functional.nll_loss(x, target) loss = loss * 5.0 return poptorch.identity_loss(loss, reduction="mean") base_model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.LogSoftmax(dim=1)) input = torch.randn(1, 10) target = torch.randint(0, 10, [1]) out = base_model(input) model = run_custom_loss_test(loss_fn=CustomLoss(), base_model=base_model, input=input, target=target, test_output_vs_target=False) model.copyWeightsToHost() # Check that the pytorch native model is also returning the trained # value that was trained on IPU. out = base_model(input) assert torch.argmax(out, dim=1) == target def test_two_custom_losses(): torch.manual_seed(42) base_model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.LogSoftmax(dim=1)) class CustomLoss(torch.nn.Module): # Mean squared error scaled. def forward(self, x, target): loss = torch.nn.functional.nll_loss(x, target) loss2 = torch.nn.functional.nll_loss(x, target) * 5.0 return loss + loss2 error_msg = ("Multiple independent losses found in graph. " "Graph must have one final loss. " "Wrap final graph loss in poptorch.identity_loss.") with pytest.raises(poptorch.Error, match=error_msg): run_custom_loss_test(loss_fn=CustomLoss(), base_model=base_model) def test_two_custom_losses_with_id_wrapper(): torch.manual_seed(42) base_model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.LogSoftmax(dim=1)) class CustomLoss(torch.nn.Module): # Mean squared error scaled. def forward(self, x, target): loss = torch.nn.functional.nll_loss(x, target) loss2 = torch.nn.functional.nll_loss(x, target) * 5.0 return poptorch.identity_loss(loss + loss2, reduction="mean") run_custom_loss_test(loss_fn=CustomLoss(), base_model=base_model, test_output_vs_target=False) def test_no_loss(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.LogSoftmax(dim=1)) # Mean squared error scaled. 
def forward(self, x, target): fwd = self.model(x) loss = fwd * 12 loss2 = target + 1 a = loss + loss2 return fwd, a, loss model = Model() optimizer = optim.SGD(model.parameters(), lr=0.01) poptorch_model = poptorch.trainingModel(model, optimizer=optimizer) label = torch.randint(0, 10, [1]) input = torch.randn(1, 10) error_msg = "Couldn't find a loss in graph" with pytest.raises(poptorch.Error, match=error_msg): _ = poptorch_model(input, label) ================================================ FILE: tests/custom_ops/CMakeLists.txt ================================================ # This compiles a shared object file for the cube operator. # Run make in the custom_ops folder to build. cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(custom_cube_op) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(POPART_DIR CACHE PATH "Path to a Popart install") set(POPLAR_DIR CACHE PATH "Path to a Poplar install") if( NOT ${POPLAR_DIR} STREQUAL "") list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) if(NOT poplar_FOUND) find_package(poplar REQUIRED) endif() else() # Check the package is not already in CMake's path find_package(poplar) if(NOT poplar_FOUND) message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") endif() endif() if( NOT EXISTS ${POPART_DIR} ) # Check the package is not already in CMake's path find_package(popart COMPONENTS popart-only) if(NOT popart_FOUND) message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") endif() else() list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) if(NOT popart_FOUND) find_package(popart REQUIRED COMPONENTS popart-only) endif() endif() # All C++ code in this project will be compiled as C++14 set (CMAKE_CXX_STANDARD 14) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_library(custom_cube_op SHARED "custom_cube_op.cpp") target_link_libraries(custom_cube_op popart-only) add_library(custom_leaky_relu_op SHARED 
"custom_leaky_relu_op.cpp") target_link_libraries(custom_leaky_relu_op popart-only) add_library(custom_add_scalar_op SHARED "custom_add_scalar_op.cpp") target_link_libraries(custom_add_scalar_op popart-only) add_library(custom_add_scalar_vec_op SHARED "custom_add_scalar_vec_op.cpp") target_link_libraries(custom_add_scalar_vec_op popart-only) add_library(custom_add_vec_scalar_mul_op SHARED "custom_add_vec_scalar_mul_op.cpp") target_link_libraries(custom_add_vec_scalar_mul_op popart-only) add_library(custom_reduce_op SHARED "custom_reduce_op.cpp") target_link_libraries(custom_reduce_op popart-only) add_library(custom_three_input_reduce_op SHARED "custom_three_input_reduce_op.cpp") target_link_libraries(custom_three_input_reduce_op popart-only) add_library(custom_many_attribute_op SHARED "custom_many_attribute_op.cpp") target_link_libraries(custom_many_attribute_op popart-only) ================================================ FILE: tests/custom_ops/custom_add_scalar_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
// This tests the use of the int_64/float attributes

// NOTE(review): the targets of the seven includes below, and the template
// arguments of std::unique_ptr, dynamic_cast, getAttribute, verifyOp,
// OpCreator and OpxCreator further down, are missing in this copy of the
// file; restore them from the repository before building.
#include
#include
#include
#include
#include
#include
#include

// Use extern to avoid mangled names when importing to python
extern "C" {

namespace custom_operators {
// Identifier of the float variant: domain "test.poptorch", type
// "AddScalarFloat".
const popart::OperatorIdentifier add_scalar_float = {
    "test.poptorch", "AddScalarFloat", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarFloatOp;
class AddScalarFloatOpx;

// Op that carries a single float attribute; its output has the same tensor
// info as its input.
class AddScalarFloatOp : public popart::Op {
public:
  AddScalarFloatOp(const popart::OperatorIdentifier &_opid, float scalar,
                   const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalar(scalar) {}

  // The output tensor info is copied from the input.
  void setup() override { outInfo(0) = inInfo(0); }

  std::unique_ptr clone() const final {
    return std::unique_ptr(new AddScalarFloatOp(*this));
  }

  float getScalar() { return _scalar; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  // Value read from the "scalar" attribute by the creator below.
  float _scalar;
};

// Creates an AddScalarFloatOp from the op's attributes; "scalar" defaults to
// 0.0f when the attribute is absent.
popart::OpCreator add_scalar_float_op(
    {{custom_operators::add_scalar_float, {}}},
    [](const popart::OpCreatorInfo &info) {
      float scalar = info.attributes.getAttribute(
          "scalar", 0.0f);
      return std::unique_ptr(
          new AddScalarFloatOp(info.opid, scalar, info.settings));
    },
    true);

// Lowering of AddScalarFloatOp: adds the stored scalar to the input tensor.
class AddScalarFloatOpx : public popart::popx::Opx {
public:
  AddScalarFloatOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp(op, custom_operators::add_scalar_float);
    // Cache the attribute value so grow() does not need the op again.
    _scalar = dynamic_cast(op)->getScalar();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto in_tensor = getInTensor(0);
    // One-element constant with the input's element type, mapped to tile 0.
    auto const_tensor = graph().addConstant(in_tensor.elementType(), {1},
                                            _scalar, "scale_factor");
    graph().setTileMapping(const_tensor, 0);
    auto out_tensor =
        popops::add(graph(), in_tensor, const_tensor, prog, debugContext());
    setOutTensor(0, out_tensor);
  }

private:
  float _scalar;
};

static popart::popx::OpxCreator
    add_scalar_float_opx_creator(custom_operators::add_scalar_float);

namespace custom_operators {
// Identifier of the integer variant: domain "test.poptorch", type
// "AddScalarInt".
const popart::OperatorIdentifier add_scalar_int = {
    "test.poptorch", "AddScalarInt", 1, 1, 1}; // NOLINT
} // namespace custom_operators

class AddScalarIntOp;
class AddScalarIntOpx;

// Op that carries a single 64-bit integer attribute; its output has the same
// tensor info as its input.
class AddScalarIntOp : public popart::Op {
public:
  AddScalarIntOp(const popart::OperatorIdentifier &_opid, std::int64_t scalar,
                 const popart::Op::Settings &settings_)
      : popart::Op(_opid, settings_), _scalar(scalar) {}

  // The output tensor info is copied from the input.
  void setup() override { outInfo(0) = inInfo(0); }

  std::unique_ptr clone() const final {
    return std::unique_ptr(new AddScalarIntOp(*this));
  }

  std::int64_t getScalar() { return _scalar; }

  // An estimate of how valuable sub-graph matching will be
  float getSubgraphValue() const final { return getLowSubgraphValue(); }

private:
  // Value read from the "scalar" attribute by the creator below.
  std::int64_t _scalar;
};

// Creates an AddScalarIntOp from the op's attributes; "scalar" defaults to 0
// when the attribute is absent.
popart::OpCreator add_scalar_int_op(
    {{custom_operators::add_scalar_int, {}}},
    [](const popart::OpCreatorInfo &info) {
      auto scalar = info.attributes.getAttribute("scalar", 0);
      return std::unique_ptr(
          new AddScalarIntOp(info.opid, scalar, info.settings));
    },
    true);

// Lowering of AddScalarIntOp: adds the stored scalar to the input tensor.
class AddScalarIntOpx : public popart::popx::Opx {
public:
  AddScalarIntOpx(popart::Op *op, popart::popx::Devicex *devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp(op, custom_operators::add_scalar_int);
    // Cache the attribute value so grow() does not need the op again.
    _scalar = dynamic_cast(op)->getScalar();
  }

  void grow(poplar::program::Sequence &prog) const override {
    auto in_tensor = getInTensor(0);
    // One-element constant with the input's element type, mapped to tile 0.
    auto const_tensor = graph().addConstant(in_tensor.elementType(), {1},
                                            _scalar, "scale_factor");
    graph().setTileMapping(const_tensor, 0);
    auto out_tensor =
        popops::add(graph(), in_tensor, const_tensor, prog, debugContext());
    setOutTensor(0, out_tensor);
  }

private:
  int64_t _scalar;
};

static popart::popx::OpxCreator
    add_scalar_int_opx_creator(custom_operators::add_scalar_int);

} // extern "C"

================================================
FILE: tests/custom_ops/custom_add_scalar_vec_op.cpp
================================================
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
// This tests the use of the int_64/float list attributes #include #include #include #include #include #include #include #include // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier add_scalar_vec_float = { "test.poptorch", "AddScalarVecFloat", 1, 1, 1}; // NOLINT } // namespace custom_operators class AddScalarVecFloatOp; class AddScalarVecFloatOpx; class AddScalarVecFloatOp : public popart::Op { public: AddScalarVecFloatOp(const popart::OperatorIdentifier &_opid, std::vector vec, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _vec(std::move(vec)) {} void setup() override { if (inInfo(0).shape().size() != 1) { exit(1); } if (static_cast(inInfo(0).shape()[0]) != _vec.size()) { exit(1); } outInfo(0) = inInfo(0); } std::unique_ptr clone() const final { return std::unique_ptr(new AddScalarVecFloatOp(*this)); } const std::vector &getVec() { return _vec; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: std::vector _vec; }; popart::OpCreator add_scalar_vec_float_op( {{custom_operators::add_scalar_vec_float, {}}}, [](const popart::OpCreatorInfo &info) { std::vector const vec = info.attributes.getAttribute("vec"); return std::unique_ptr( new AddScalarVecFloatOp(info.opid, vec, info.settings)); }, true); class AddScalarVecFloatOpx : public popart::popx::Opx { public: AddScalarVecFloatOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, custom_operators::add_scalar_vec_float); _vec = dynamic_cast(op)->getVec(); } void grow(poplar::program::Sequence &prog) const override { auto in_tensor = getInTensor(0); auto const_tensor = graph().addConstant( poplar::FLOAT, {_vec.size()}, poplar::ArrayRef(_vec.data(), _vec.size()), "vec"); graph().setTileMapping(const_tensor, 0); auto out_tensor = popops::add(graph(), in_tensor, const_tensor, prog, 
debugContext()); setOutTensor(0, out_tensor); } private: std::vector _vec; }; static popart::popx::OpxCreator add_scalar_vec_float_opx_creator(custom_operators::add_scalar_vec_float); namespace custom_operators { const popart::OperatorIdentifier add_scalar_vec_int = { "test.poptorch", "AddScalarVecInt", 1, 1, 1}; // NOLINT } // namespace custom_operators class AddScalarVecIntOp; class AddScalarVecIntOpx; class AddScalarVecIntOp : public popart::Op { public: AddScalarVecIntOp(const popart::OperatorIdentifier &_opid, std::vector vec, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _vec(std::move(vec)) {} void setup() override { if (inInfo(0).shape().size() != 1) { exit(1); } if (static_cast(inInfo(0).shape()[0]) != _vec.size()) { exit(1); } outInfo(0) = inInfo(0); } std::unique_ptr clone() const final { return std::unique_ptr(new AddScalarVecIntOp(*this)); } const std::vector &getVec() { return _vec; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: std::vector _vec; }; popart::OpCreator add_scalar_vec_int_op( {{custom_operators::add_scalar_vec_int, {}}}, [](const popart::OpCreatorInfo &info) { std::vector const vec = info.attributes.getAttribute("vec"); return std::unique_ptr( new AddScalarVecIntOp(info.opid, vec, info.settings)); }, true); class AddScalarVecIntOpx : public popart::popx::Opx { public: AddScalarVecIntOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, custom_operators::add_scalar_vec_int); _vec = dynamic_cast(op)->getVec(); } void grow(poplar::program::Sequence &prog) const override { auto in_tensor = getInTensor(0); auto const_tensor = graph().addConstant( poplar::INT, {_vec.size()}, poplar::ArrayRef(_vec.data(), _vec.size()), "vec"); graph().setTileMapping(const_tensor, 0); auto out_tensor = popops::add(graph(), in_tensor, const_tensor, prog, debugContext()); setOutTensor(0, out_tensor); } 
private: std::vector _vec; }; static popart::popx::OpxCreator add_scalar_vec_int_opx_creator(custom_operators::add_scalar_vec_int); } // extern "C" ================================================ FILE: tests/custom_ops/custom_add_vec_scalar_mul_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. // This tests the use of the int_64/float attributes #include #include #include #include #include #include #include // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier add_vec_scalar_mul_float = { "test.poptorch", "AddVecScalarMulFloat", 1, 1, 1}; // NOLINT } // namespace custom_operators class AddVecScalarMulFloatOp; class AddVecScalarMulFloatOpx; // Add the vec and multiply by the scalar class AddVecScalarMulFloatOp : public popart::Op { public: AddVecScalarMulFloatOp(const popart::OperatorIdentifier &_opid, float scalar, std::vector vec, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _scalar(scalar), _vec(std::move(vec)) {} void setup() override { if (inInfo(0).shape().size() != 1) { exit(1); } if (static_cast(inInfo(0).shape()[0]) != _vec.size()) { exit(1); } outInfo(0) = inInfo(0); } std::unique_ptr clone() const final { return std::unique_ptr(new AddVecScalarMulFloatOp(*this)); } float getScalar() { return _scalar; } const std::vector &getVec() { return _vec; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: float _scalar; std::vector _vec; }; popart::OpCreator add_vec_scalar_mul_float_op( {{custom_operators::add_vec_scalar_mul_float, {}}}, [](const popart::OpCreatorInfo &info) { float const scalar = info.attributes.getAttribute("scalar", 0.0f); std::vector const vec = info.attributes.getAttribute("vec"); return std::unique_ptr( new AddVecScalarMulFloatOp(info.opid, scalar, vec, info.settings)); }, true); class 
AddVecScalarMulFloatOpx : public popart::popx::Opx { public: AddVecScalarMulFloatOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp( op, custom_operators::add_vec_scalar_mul_float); _scalar = dynamic_cast(op)->getScalar(); _vec = dynamic_cast(op)->getVec(); } void grow(poplar::program::Sequence &prog) const override { auto in_tensor = getInTensor(0); auto vec_tensor = graph().addConstant( poplar::FLOAT, {_vec.size()}, poplar::ArrayRef(_vec.data(), _vec.size()), "vec"); graph().setTileMapping(vec_tensor, 0); auto added_tensor = popops::add(graph(), in_tensor, vec_tensor, prog, debugContext()); auto scalar_tensor = graph().addConstant(poplar::FLOAT, {1}, _scalar, "scale_factor"); graph().setTileMapping(scalar_tensor, 0); auto out_tensor = popops::mul(graph(), added_tensor, scalar_tensor, prog, debugContext()); setOutTensor(0, out_tensor); } private: float _scalar; std::vector _vec; }; static popart::popx::OpxCreator add_scalar_float_opx_creator(custom_operators::add_vec_scalar_mul_float); } // extern "C" ================================================ FILE: tests/custom_ops/custom_cube_op.cpp ================================================ // Copyright (c) 2020 Graphcore Ltd. All rights reserved. // This file has been lifted directly from the PopART examples. See file there // for usage. Modified to take in and return two tensors. 
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace { // for C++11 compatibility, we don't use std::make_unique template std::unique_ptr make_unique(Args &&...args) { return std::unique_ptr(new T(std::forward(args)...)); } } // namespace // Use extern to avoid mangled names when importing to python extern "C" { namespace CustomOperators { const popart::OperatorIdentifier Cube = { "com.acme", "Cube", 1, {2, 2}}; // NOLINT } // namespace CustomOperators namespace CustomGradOperators { const static popart::OperatorIdentifier CubeGrad = { // NOLINT "com.acme", "CubeGrad", 1, {2, 2}}; } // namespace CustomGradOperators // For training with a custom Op, four classes need to be implemented, // one for each of: // {forward, gradient} x {Op, Opx}. // // If only inference is required, then two classes need to be implemented: // {forward} x {Op, Opx}. // // The Op is a poplar/hardware agnostic description of the computation. // the Opx is the poplar implementation of the Op. 
// // We do training in this example, so the four classes implemented are: // class CubeOp; class CubeGradOp; class CubeOpx; class CubeGradOpx; // The forward Op class CubeOp : public popart::Op { public: CubeOp(const popart::OperatorIdentifier &_opid, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_) {} // Configure the output popart Tensor void setup() override { outInfo(0) = inInfo(0); outInfo(1) = inInfo(1); } std::unique_ptr clone() const final { return make_unique(*this); } std::vector> getGradOps() override { std::vector> upops; upops.emplace_back(make_unique(*this)); return upops; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } }; static popart::OpCreator cubeOpCreator({{CustomOperators::Cube, {}}}, true); // The forward Opx (poplar implementation of the forward Op) class CubeOpx : public popart::popx::Opx { public: CubeOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { // Not strictly necessary, we check that op is castable to a CubeOp *. 
verifyOp(op, CustomOperators::Cube); } void grow(poplar::program::Sequence &prog) const override { auto output = popops::map( graph(), popops::expr::Add(popops::expr::Mul(popops::expr::Mul(popops::expr::_1, popops::expr::_1), popops::expr::_1), popops::expr::_2), {getInTensor(0), getInTensor(1)}, prog, debugContext()); setOutTensor(0, output); auto output2 = popops::map( graph(), popops::expr::Mul(popops::expr::Mul(popops::expr::_1, popops::expr::_1), popops::expr::_1), {getInTensor(0)}, prog, debugContext()); setOutTensor(1, output2); } }; // The gradient Op class CubeGradOp : public popart::Op { public: explicit CubeGradOp(const popart::Op &fwdOp) : popart::Op(CustomGradOperators::CubeGrad, fwdOp.getSettings()) {} std::unique_ptr clone() const final { return make_unique(*this); } // same comment as for CubeOp, for running shape/type inference "statically" void setup() override { outInfo(0) = inInfo(0); } // function describing the inputs and output(s) of CubeGradOp // The Gradient Op which we are implementing (CubeGradOp) has 2 inputs. // The input at index 0 is: // the gradient of the 0'th output Tensor of the CubeOp. // The input at index 1 is : // the 0'th output Tensor of the CubeOp. 
// Supposing the CubeOp has input Tensor T0 and output Tensor T1, // // input at index 0 (T0) // | // CubeOp // | // output at index 0 (T1) // // Then the picture described by the map below looks like, // // // input at index 0 (gradient of T1) // | input at index 1 (T1) // | | // | | // CubeGradOp // | // | // output at index 0 (gradient of T0) // const std::vector &gradInputInfo() const override { static const std::vector inInfo = { {0, 0, popart::GradOpInType::GradOut}, {1, 1, popart::GradOpInType::Out}}; return inInfo; } const std::map &gradOutToNonGradIn() const override { static const std::map outInfo = {{0, 0}}; return outInfo; } // an estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } }; class CubeGradOpx : public popart::popx::Opx { public: CubeGradOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, CustomGradOperators::CubeGrad); } // Create the gradient poplar::Tensor, which is // 3 * input_to_cube**2 * gradient_of_cube_output void grow(poplar::program::Sequence &prog) const final { insert( outId(0), popops::map(graph(), popops::expr::Mul( popops::expr::Const(3), popops::expr::Mul(popops::expr::Mul(popops::expr::_1, popops::expr::_1), popops::expr::_2)), {getInTensor(0), getInTensor(1)}, // FwdOut, GradOut prog, debugContext())); } }; static popart::popx::OpxCreator cubeOpxCreator(CustomOperators::Cube); static popart::popx::OpxCreator cubeGradOpxCreator(CustomGradOperators::CubeGrad); } static popart::RegisterShapeInferenceFunction cubeOpShapeInference(CustomOperators::Cube, [](auto &ctx) { ctx.outInfo(0) = ctx.inInfo(0); }); ================================================ FILE: tests/custom_ops/custom_leaky_relu_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
// This file is based on the example in the PopART User Guide: // https://docs.sourcevertex.net/files/popart-popart-user-guide-latest/custom_ops.html #include #include #include #include #include #include #include namespace { // for C++11 compatibility, we don't use std::make_unique template std::unique_ptr makeUnique(Args &&...args) { return std::unique_ptr(new T(std::forward(args)...)); } } // namespace // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier leaky_relu = { "com.acme", "LeakyRelu", 1, {1, 1}}; // NOLINT } // namespace custom_operators namespace custom_grad_operators { const static popart::OperatorIdentifier LeakyReluGrad = { // NOLINT "com.acme", "LeakyReluGrad", 1, {1, 1}}; } // namespace custom_grad_operators class LeakyReluGradOp; class LeakyReluOp : public popart::Op { public: LeakyReluOp(const popart::OperatorIdentifier &_opid, float alpha_, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _alpha(alpha_) {} std::unique_ptr clone() const final { return makeUnique(*this); } void setup() final { outInfo(0) = inInfo(0); } void appendAttributes(popart::OpSerialiserBase &os) const override { Op::appendAttributes(os); os.appendAttribute("alpha", getAlpha()); } void appendOutlineAttributes(popart::OpSerialiserBase &os) const override { Op::appendOutlineAttributes(os); os.appendAttribute("alpha", getAlpha()); } std::vector> getGradOps() override { std::vector> upops; upops.emplace_back(makeUnique(*this)); return upops; } float getSubgraphValue() const final { return getHighSubgraphValue(); } bool requiresRandomSeed() const override { return false; } // Attributes float getAlpha() const { return _alpha; } private: float _alpha; }; static popart::OpDefinition::DataTypes t = {popart::DataType::FLOAT16, popart::DataType::FLOAT}; static popart::OpDefinition leaky_relu_op_def({popart::OpDefinition::Inputs({{"input", t}}), 
popart::OpDefinition::Outputs({{"output", t}}), popart::OpDefinition::Attributes({{"alpha", {"*"}}})}); static popart::OpCreator leaky_relu_op_creator( popart::OpDefinitions({{custom_operators::leaky_relu, leaky_relu_op_def}}), [](const popart::OpCreatorInfo &info) { float alpha = info.attributes.getAttribute( "alpha", 1e-2f); // default epsilon is 10**(-2) return makeUnique(info.opid, alpha, info.settings); }, true); class LeakyReluOpx : public popart::popx::Opx { public: LeakyReluOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, {custom_operators::leaky_relu}); } void grow(poplar::program::Sequence &prog) const final { auto op = getOp(); poplar::Tensor input = getInTensor(0); float alpha = op.getAlpha(); // x < 0.0f ? alpha * x : x auto expression = popops::expr::Select( popops::expr::Mul(popops::expr::Const(alpha), popops::expr::_1), popops::expr::_1, popops::expr::Lt(popops::expr::_1, popops::expr::Const(0.0f))); popops::mapInPlace(graph(), expression, {input}, prog, debugContext("LeakyRelu"), poplar::OptionFlags()); setOutTensor(0, input); } }; static popart::popx::OpxCreator add_scalar_float_opx_creator(custom_operators::leaky_relu); class LeakyReluGradOp : public popart::Op { public: explicit LeakyReluGradOp(const LeakyReluOp &fwdOp) : popart::Op(custom_grad_operators::LeakyReluGrad, fwdOp.settings), _alpha(fwdOp.getAlpha()) {} std::unique_ptr clone() const final { return std::make_unique(*this); } void setup() final { outInfo(0) = inInfo(0); }; const std::vector &gradInputInfo() const override { static const std::vector in_info = { {0, 0, popart::GradOpInType::GradOut}, {1, 0, popart::GradOpInType::In}}; return in_info; } // The Grad Op has 1 output, which is the gradient of the only input const std::map &gradOutToNonGradIn() const override { static const std::map out_info = {{0, 0}}; return out_info; } bool requiresRandomSeed() const override { return false; } // an estimate of how valuable sub-graph 
matching will be float getSubgraphValue() const final { return getHighSubgraphValue(); } float getAlpha() const { return _alpha; } // Implementation defined below void appendAttributes(popart::OpSerialiserBase &os) const override { Op::appendAttributes(os); os.appendAttribute("alpha", getAlpha()); } // Implementation defined below void appendOutlineAttributes(popart::OpSerialiserBase &os) const override { Op::appendOutlineAttributes(os); os.appendAttribute("alpha", getAlpha()); } private: float _alpha; }; class LeakyReluGradOpx : public popart::popx::Opx { public: LeakyReluGradOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, {custom_grad_operators::LeakyReluGrad}); } void grow(poplar::program::Sequence &prog) const final { auto op = getOp(); poplar::Tensor grad = getInTensor(0); poplar::Tensor input = getInTensor(1); float alpha = op.getAlpha(); // (grad * (x < 0.0f ? alpha : 1)) auto expression = popops::expr::Mul( popops::expr::Select( popops::expr::Const(alpha), popops::expr::Const(1.0f), popops::expr::Lt(popops::expr::_2, popops::expr::Const(0.0f))), popops::expr::_1); auto output = popops::map(graph(), expression, {grad, input}, prog, debugContext("LeakyReluGrad"), poplar::OptionFlags()); setOutTensor(0, output); } }; static popart::popx::OpxCreator leaky_relu_opx_creator({custom_operators::leaky_relu}); static popart::popx::OpxCreator leaky_relu_grad_opx_creator(custom_grad_operators::LeakyReluGrad); } // extern "C" ================================================ FILE: tests/custom_ops/custom_many_attribute_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
// This tests the use of many attributes #include #include #include #include #include #include // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier many_attribute = { "test.poptorch", "ManyAttributeOp", 1, 1, 1}; // NOLINT } // namespace custom_operators class ManyAttributeOp; class ManyAttributeOpx; // Adds one if all attributes in the creator were correct, otherwise acts // as an identity function class ManyAttributeOp : public popart::Op { public: ManyAttributeOp(const popart::OperatorIdentifier &_opid, bool all_passed, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _all_passed(all_passed) {} void setup() override { outInfo(0) = inInfo(0); } std::unique_ptr clone() const final { return std::unique_ptr(new ManyAttributeOp(*this)); } bool allPassed() { return _all_passed; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: bool _all_passed; }; popart::OpCreator many_attribute_op( {{custom_operators::many_attribute, {}}}, [](const popart::OpCreatorInfo &info) { bool correct = false; // Have 2 of each kind of attribute if (info.attributes.getAttribute( "float_one") == 1.0 && info.attributes.getAttribute( "float_minus_two") == -2.0 && info.attributes.getAttribute("int_zero") == 0 && info.attributes.getAttribute( "int_minus_five") == -5 && info.attributes.getAttribute( "floats_one_two_three") == std::vector{1.0, 2.0, 3.0} && info.attributes.getAttribute( "floats_minus_one_two_three") == std::vector{-1.0, -2.0, -3.0} && info.attributes.getAttribute( "ints_one_two_three") == std::vector{1, 2, 3} && info.attributes.getAttribute( "ints_minus_one_two_three") == std::vector{-1, -2, -3} && info.attributes.getAttribute( "a_string") == "string with quotes and slash \" ' \\ end" && info.attributes.getAttribute("strs") == std::vector{"\x01", "\x02", "\x03"}) { correct = true; } return 
std::unique_ptr( new ManyAttributeOp(info.opid, correct, info.settings)); }, true); class ManyAttributeOpx : public popart::popx::Opx { public: ManyAttributeOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, custom_operators::many_attribute); _all_passed = dynamic_cast(op)->allPassed(); } void grow(poplar::program::Sequence &prog) const override { auto in_tensor = getInTensor(0); auto const_tensor = graph().addConstant(in_tensor.elementType(), {1}, 1, "one"); graph().setTileMapping(const_tensor, 0); if (_all_passed) { auto out_tensor = popops::add(graph(), in_tensor, const_tensor, prog, debugContext()); setOutTensor(0, out_tensor); } else { setOutTensor(0, in_tensor); } } private: bool _all_passed; }; static popart::popx::OpxCreator many_attributes_opx_creator(custom_operators::many_attribute); } // extern "C" ================================================ FILE: tests/custom_ops/custom_reduce_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
// This tests the use of the string attribute #include #include #include #include #include #include #include // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier reduce = {"test.poptorch", "ReduceOp", 1, 1, 1}; // NOLINT } // namespace custom_operators class ReduceOp; class ReduceOpx; class ReduceOp : public popart::Op { public: ReduceOp(const popart::OperatorIdentifier &_opid, std::string reduction, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _reduction(std::move(reduction)) {} void setup() override { auto in_tensor = inInfo(0); popart::Shape out_shape({}); outInfo(0).set(in_tensor.dataType(), out_shape); } std::unique_ptr clone() const final { return std::unique_ptr(new ReduceOp(*this)); } std::string getReduction() { return _reduction; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: std::string _reduction; }; popart::OpCreator reduce_op( {{custom_operators::reduce, {}}}, [](const popart::OpCreatorInfo &info) { auto reduction = info.attributes.getAttribute( "reduction", "mean"); return std::unique_ptr( new ReduceOp(info.opid, reduction, info.settings)); }, true); class ReduceOpx : public popart::popx::Opx { public: ReduceOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, custom_operators::reduce); if (dynamic_cast(op)->getReduction() == "mean") { _mean = true; } else if (dynamic_cast(op)->getReduction() == "sum") { _mean = false; } else { exit(1); } } void grow(poplar::program::Sequence &prog) const override { const poplar::Tensor &in_tensor(getInTensor(0)); auto in_tensor_1_d = in_tensor.flatten(); double scale = 1.0; if (_mean) { scale /= in_tensor_1_d.dim(0); } auto scale_tensor = graph().addConstant(poplar::FLOAT, {}, scale, "scale"); graph().setTileMapping(scale_tensor, 0); auto out_tensor = 
popops::reduce(graph(), in_tensor_1_d, {0}, {popops::Operation::ADD, false, scale_tensor}, prog, debugContext("reduce")); setOutTensor(0, out_tensor); } private: // Mean if true, otherwise sum bool _mean; }; static popart::popx::OpxCreator reduce_opx_creator(custom_operators::reduce); } // extern "C" ================================================ FILE: tests/custom_ops/custom_three_input_reduce_op.cpp ================================================ // Copyright (c) 2021 Graphcore Ltd. All rights reserved. // This tests the use of the list of strings attribute #include #include #include #include #include #include // Use extern to avoid mangled names when importing to python extern "C" { namespace custom_operators { const popart::OperatorIdentifier three_reduce = { "test.poptorch", "ThreeReduceOp", 1, 3, 3}; // NOLINT } // namespace custom_operators class ThreeReduceOp; class ThreeReduceOpx; class ThreeReduceOp : public popart::Op { public: ThreeReduceOp(const popart::OperatorIdentifier &_opid, std::vector reductions, const popart::Op::Settings &settings_) : popart::Op(_opid, settings_), _reductions(std::move(reductions)) {} void setup() override { for (unsigned int i = 0; i < 3; i++) { auto in_tensor = inInfo(i); popart::Shape out_shape({}); outInfo(i).set(in_tensor.dataType(), out_shape); } } std::unique_ptr clone() const final { return std::unique_ptr(new ThreeReduceOp(*this)); } const std::vector &getReductions() { return _reductions; } // An estimate of how valuable sub-graph matching will be float getSubgraphValue() const final { return getLowSubgraphValue(); } private: const std::vector _reductions; }; popart::OpCreator three_reduce_op( {{custom_operators::three_reduce, {}}}, [](const popart::OpCreatorInfo &info) { auto reductions = info.attributes.getAttribute( "reductions"); return std::unique_ptr( new ThreeReduceOp(info.opid, reductions, info.settings)); }, true); class ThreeReduceOpx : public popart::popx::Opx { public: ThreeReduceOpx(popart::Op *op, 
popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { verifyOp(op, custom_operators::three_reduce); auto reductions = dynamic_cast(op)->getReductions(); _mean.reserve(reductions.size()); for (auto &reduction : reductions) { if (reduction == "mean") { _mean.emplace_back(true); } else { _mean.emplace_back(false); if (reduction != "sum") { exit(1); } } } } void grow(poplar::program::Sequence &prog) const override { for (unsigned int input_num = 0; input_num < 3; input_num++) { const poplar::Tensor &in_tensor(getInTensor(input_num)); auto in_tensor_1_d = in_tensor.flatten(); double scale = 1.0; if (_mean[input_num]) { scale /= in_tensor_1_d.dim(0); } auto scale_tensor = graph().addConstant(poplar::FLOAT, {}, scale, "scale"); graph().setTileMapping(scale_tensor, 0); auto out_tensor = popops::reduce(graph(), in_tensor_1_d, {0}, {popops::Operation::ADD, false, scale_tensor}, prog, debugContext("thee_reduce")); setOutTensor(input_num, out_tensor); } } private: // Mean if true, otherwise sum std::vector _mean; }; static popart::popx::OpxCreator reduce_opx_creator(custom_operators::three_reduce); } // extern "C" ================================================ FILE: tests/custom_ops_attributes_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import collections import ctypes import pathlib import random import sys import pytest import torch import helpers import poptorch myso = list(pathlib.Path("tests").rglob("libcustom_*.*")) assert myso, "Failed to find libcustom_* libraries" for single_so in myso: ctypes.cdll.LoadLibrary(single_so) def test_float_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarFloat", "test.poptorch", 1, example_outputs=[x], attributes={"scalar": 3.5}) return x model = Model() x = torch.tensor([5.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) expected = torch.tensor([8.5]) helpers.assert_allclose(actual=out[0], expected=expected) def test_float_attribute_too_low(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarFloat", "test.poptorch", 1, example_outputs=[x], attributes={"scalar": -sys.float_info.max}) return x model = Model() x = torch.tensor([5.0]) inference_model = poptorch.inferenceModel(model) with pytest.raises(poptorch.Error, match=r"-1\.79769e\+308 is too low for a Popart float " r"attribute\."): inference_model(x) def test_float_attribute_too_high(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarFloat", "test.poptorch", 1, example_outputs=[x], attributes={"scalar": sys.float_info.max}) return x model = Model() x = torch.tensor([5.0]) inference_model = poptorch.inferenceModel(model) with pytest.raises(poptorch.Error, match=r"1\.79769e\+308 is too high for a Popart float " r"attribute\."): inference_model(x) def test_int_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarInt", "test.poptorch", 1, example_outputs=[x], attributes={"scalar": 3}) return x model = Model() x = torch.tensor([5]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor([8], dtype=torch.int32)) def 
test_float_list_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarVecFloat", "test.poptorch", 1, example_outputs=[x], attributes={"vec": [1.0, 2.0, 3.0]}) return x model = Model() x = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allclose(actual=out[0], expected=torch.tensor([4.0, 6.0, 8.0])) def test_float_list_attribute_too_low(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op( [x], "AddScalarVecFloat", "test.poptorch", 1, example_outputs=[x], attributes={"vec": [1.0, 2.0, -sys.float_info.max]}) return x model = Model() x = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) with pytest.raises(poptorch.Error, match=r"-1\.79769e\+308 is too low for a Popart float " r"attribute\."): inference_model(x) def test_float_list_attribute_too_high(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op( [x], "AddScalarVecFloat", "test.poptorch", 1, example_outputs=[x], attributes={"vec": [sys.float_info.max, 2.0, 3.0]}) return x model = Model() x = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) with pytest.raises(poptorch.Error, match=r"1\.79769e\+308 is too high for a Popart float " r"attribute\."): inference_model(x) def test_float_tuple_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarVecFloat", "test.poptorch", 1, example_outputs=[x], attributes={"vec": (1.0, 2.0, 3.0)}) return x model = Model() x = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allclose(expected=out[0], actual=torch.tensor([4.0, 6.0, 8.0])) def test_int_list_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarVecInt", "test.poptorch", 1, example_outputs=[x], attributes={"vec": [1, 2, 3]}) return x 
model = Model() x = torch.tensor([3, 4, 5]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor([4, 6, 8], dtype=torch.int32)) def test_float_combined_attributes(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddVecScalarMulFloat", "test.poptorch", 1, example_outputs=[x], attributes={ "vec": [1.0, 2.0, 3.0], "scalar": 2.0 }) return x model = Model() x = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor([8.0, 12.0, 16.0])) def test_int_two_attributes(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "AddScalarInt", "test.poptorch", 1, example_outputs=[x], attributes={"scalar": 3}) x = poptorch.custom_op(x, "AddScalarInt", "test.poptorch", 1, example_outputs=x, attributes={"scalar": 2}) return x model = Model() x = torch.tensor([5]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor([10], dtype=torch.int32)) @pytest.mark.parametrize("attr", ("sum", "mean")) def test_string_attribute(attr): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "ReduceOp", "test.poptorch", 1, example_outputs=[x], attributes={"reduction": attr}) return x model = Model() x = torch.tensor([5.0, 6.0, 7.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) if attr == "mean": helpers.assert_allclose(actual=out[0], expected=torch.tensor(6.0)) else: helpers.assert_allclose(actual=out[0], expected=torch.tensor(18.0)) def test_non_ascii_string_attribute(): class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "ReduceOp", "test.poptorch", 1, example_outputs=[x], attributes={"reduction": "a\u1f00b"}) return x model = Model() x = torch.tensor([5.0, 6.0, 7.0]) inference_model = 
poptorch.inferenceModel(model) with pytest.raises(ValueError, match="a\u1f00b contains non-ASCII characters."): inference_model(x) def test_string_list_attribute(): class Model(torch.nn.Module): def forward(self, x, y, z): x = poptorch.custom_op( [x, y, z], "ThreeReduceOp", "test.poptorch", 1, example_outputs=[x, y, z], attributes={"reductions": ["mean", "sum", "mean"]}) return x model = Model() x = torch.tensor([1.0, 2.0, 3.0]) y = torch.tensor([2.0, 3.0, 4.0]) z = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x, y, z) helpers.assert_allequal(actual=out[0], expected=torch.tensor(2.0)) helpers.assert_allequal(actual=out[1], expected=torch.tensor(9.0)) helpers.assert_allequal(actual=out[2], expected=torch.tensor(4.0)) def test_non_asciistring_list_attribute(): class Model(torch.nn.Module): def forward(self, x, y, z): x = poptorch.custom_op( [x, y, z], "ThreeReduceOp", "test.poptorch", 1, example_outputs=[x, y, z], attributes={"reductions": ["a\u1f00b", "sum", "mean"]}) return x model = Model() x = torch.tensor([1.0, 2.0, 3.0]) y = torch.tensor([2.0, 3.0, 4.0]) z = torch.tensor([3.0, 4.0, 5.0]) inference_model = poptorch.inferenceModel(model) with pytest.raises(ValueError, match="a\u1f00b contains non-ASCII characters."): inference_model(x, y, z) ALL_ATTRIBUTES = { "float_one": 1.0, "float_minus_two": -2.0, "int_zero": 0, "int_minus_five": -5, "floats_one_two_three": [1.0, 2.0, 3.0], "floats_minus_one_two_three": [-1.0, -2.0, -3.0], "ints_one_two_three": [1, 2, 3], "ints_minus_one_two_three": [-1, -2, -3], "a_string": "string with quotes and slash \" ' \\ end", "strs": ["\x01", "\x02", "\x03"] } @pytest.mark.parametrize("seed", range(10)) def test_many_attributes(seed): attr_keys = list(ALL_ATTRIBUTES.keys()) random.seed(seed) random.shuffle(attr_keys) attrs_shuff = collections.OrderedDict() for key in attr_keys: attrs_shuff[key] = ALL_ATTRIBUTES[key] class Model(torch.nn.Module): def forward(self, x): x = 
poptorch.custom_op([x], "ManyAttributeOp", "test.poptorch", 1, example_outputs=[x], attributes=attrs_shuff) return x model = Model() x = torch.tensor([0.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor(1.0).reshape((1, ))) @pytest.mark.parametrize("seed", range(3)) def test_many_attributes_one_wrong(seed): attr_keys = list(ALL_ATTRIBUTES.keys()) random.seed(seed) random.shuffle(attr_keys) attrs_shuff = collections.OrderedDict() for key in attr_keys: attrs_shuff[key] = ALL_ATTRIBUTES[key] attrs_shuff["a_string"] = "Very wrong" class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "ManyAttributeOp", "test.poptorch", 1, example_outputs=[x], attributes=attrs_shuff) return x model = Model() x = torch.tensor([0.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allequal(actual=out[0], expected=torch.tensor(0.0).reshape((1, ))) #many_attribtes_examples_start def test_many_attributes_examples(): class Model(torch.nn.Module): def forward(self, x): attributes = { "float_one": 1.0, "float_minus_two": -2.0, "int_zero": 0, "int_minus_five": -5, "floats_one_two_three": [1.0, 2.0, 3.0], "floats_minus_one_two_three": [-1.0, -2.0, -3.0], "ints_one_two_three": [1, 2, 3], "ints_minus_one_two_three": [-1, -2, -3], "a_string": "string with quotes and slash \" ' \\ end", "strs": ["abc", "def", "ghi"] } x = poptorch.custom_op([x], "ManyAttributeOp", "test.poptorch", 1, example_outputs=[x], attributes=attributes) #many_attribtes_examples_end return x model = Model() x = torch.tensor([0.0]) inference_model = poptorch.inferenceModel(model) inference_model(x) ================================================ FILE: tests/custom_ops_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import ctypes
import pathlib

import torch
import torch.nn as nn
import helpers
import poptorch

# Locate and load the shared libraries that register the custom ops used by
# the tests below. The search is relative to the current working directory.
#loading_library_start
myso = list(pathlib.Path("tests").rglob("libcustom_cube_op.*"))
assert myso, "Failed to find libcustom_cube_op"
myop = ctypes.cdll.LoadLibrary(myso[0])
#loading_library_end

myso = list(pathlib.Path("tests").rglob("libcustom_leaky_relu_op.*"))
assert myso, "Failed to find libcustom_leaky_relu_op"
myop = ctypes.cdll.LoadLibrary(myso[0])


#inference_start
def test_inference():
    """Run the "Cube" custom op in an inference model and check both outputs."""

    class BasicNetwork(nn.Module):
        def forward(self, x, bias):
            x, y = poptorch.custom_op([x, bias],
                                      "Cube",
                                      "com.acme",
                                      1,
                                      example_outputs=[x, x])
            return x, y
            #inference_end

    network = BasicNetwork()

    inputs = torch.full((1, 8), 2.0)
    offset = torch.full((1, 8), 4.0)

    ipu_model = poptorch.inferenceModel(network)
    results = ipu_model(inputs, offset)

    wanted = (torch.full((1, 8), 12.0), torch.full((1, 8), 8.0))

    helpers.assert_allclose(actual=results[0], expected=wanted[0])
    helpers.assert_allclose(actual=results[1], expected=wanted[1])


def test_training():
    """Train a small network containing the "Cube" custom op towards class 42."""

    def custom_loss(model_out, labels):
        nll = torch.nn.functional.nll_loss(model_out[0], labels)
        # Popart errors if this is unused.
        aux = torch.sum(model_out[1]) * 0.0001
        return nll + aux

    class TrainingNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln = torch.nn.Linear(100, 100)
            self.softmax = nn.Softmax(1)

        def forward(self, t, target):
            x, bias = t[0], t[1]
            x, y = poptorch.custom_op([x, bias],
                                      "Cube",
                                      "com.acme",
                                      1,
                                      example_outputs=[x, x])
            x = self.softmax(self.ln(x))
            out = (x, y)
            return out, custom_loss(out, target)

    network = TrainingNetwork()

    # This first draw is immediately overwritten in the loop; it is kept so
    # the sequence of random numbers consumed by the test is unchanged.
    x = torch.rand((1, 100))
    bias = torch.full((1, 100), 2.0)
    label = torch.full([1], 42, dtype=torch.long)

    poptorch_model = poptorch.trainingModel(network)

    for _ in range(100):
        x = torch.rand((1, 100))
        out, _ = poptorch_model((x, bias), label)

    assert torch.argmax(out[0]) == 42


# Check that the custom op not only trains but also propagates the gradient backwards.
def test_training_both_sides(): def custom_loss(model_out, labels): l1 = torch.nn.functional.nll_loss(model_out[0], labels) # Popart errors if this is unused. l2 = torch.sum(model_out[1]) * 0.0001 return l1 + l2 class TrainingNetwork(nn.Module): def __init__(self): super().__init__() self.ln1 = torch.nn.Linear(100, 100) self.ln2 = torch.nn.Linear(100, 100) self.softmax = nn.Softmax(1) def forward(self, t, target): x = self.ln1(t[0]) bias = t[1] x, y = poptorch.custom_op([x, bias], "Cube", "com.acme", 1, example_outputs=[x, x]) x = self.ln2(x) x = self.softmax(x) out = (x, y) loss = custom_loss(out, target) return out, loss model = TrainingNetwork() x = torch.rand((1, 100)) bias = torch.full((1, 100), 2.0) y = torch.full([1], 42, dtype=torch.long) weights_before = model.ln1.weight.clone() poptorch_model = poptorch.trainingModel(model) for _ in range(0, 100): x = torch.rand((1, 100)) out, _ = poptorch_model((x, bias), y) assert not torch.allclose(weights_before, model.ln1.weight) assert torch.argmax(out[0]) == 42 def test_inference_with_an_attribute(): #inference_with_attribute_start class Model(torch.nn.Module): def forward(self, x): x = poptorch.custom_op([x], "LeakyRelu", "com.acme", 1, example_outputs=[x], attributes={"alpha": 0.02}) return x[0] #inference_with_attribute_end model = Model() x = torch.tensor([-1.0, -0.5, 0.0, 0.5, 1.0]) inference_model = poptorch.inferenceModel(model) out = inference_model(x) helpers.assert_allclose(actual=out, expected=torch.tensor( [-0.02, -0.01, 0.0, 0.5, 1.0])) ================================================ FILE: tests/dataloader_test.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import itertools
import functools
import math
import random
import time
import subprocess
import marshal
import re
import os
import sys
import signal
import torch
import pytest
import numpy
import helpers
import poptorch


class BrokenDataset(torch.utils.data.Dataset):
    """Map-style dataset whose ``__getitem__`` always fails."""

    def __init__(self, length):
        super().__init__()
        self._length = length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        assert False, "Broken dataset"


class IncrementDataset(torch.utils.data.Dataset):
    """Map-style dataset: item ``i`` is a tensor of ``shape`` filled with ``i``."""

    def __init__(self, shape, length, dtype=torch.float32):
        super().__init__()
        self._shape = shape
        self._length = length
        self._dtype = dtype

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        # Raised so that iterating the dataset directly (via __getitem__)
        # terminates once the last element has been returned.
        if index >= self._length:
            raise StopIteration
        return torch.full(self._shape, index, dtype=self._dtype)


class IncrementIterableDataset(torch.utils.data.IterableDataset):
    """Iterable dataset yielding ``length`` tensors filled with
    ``start``, ``start + 1``, ...

    ``length`` and ``start`` are public because ``_worker_init_fn`` rewrites
    them to give each worker its own slice.
    """

    def __init__(self, shape, length, start=0, dtype=torch.float32):
        super().__init__()
        self._shape = shape
        self.length = length
        self.start = start
        self._dtype = dtype

    def __iter__(self):
        for index in range(self.length):
            yield torch.full(self._shape,
                             self.start + index,
                             dtype=self._dtype)

    def __getitem__(self, index):
        raise NotImplementedError("No __getitem__ for iterable datasets")


class IncrementIterableDatasetWithLen(IncrementIterableDataset):
    """``IncrementIterableDataset`` that also reports its length."""

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        raise NotImplementedError("No __getitem__ for iterable datasets")


class IncrementDatasetWithLabels(torch.utils.data.Dataset):
    """Map-style dataset returning ``(data, label)`` tuples filled with the
    index."""

    def __init__(self, shape, length):
        super().__init__()
        self._shape = shape
        self._length = length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return (torch.full(self._shape, index, dtype=torch.float32),
                torch.full((1, ), index, dtype=torch.long))


class IncrementDatasetWithLabelsDict(torch.utils.data.Dataset):
    """Map-style dataset returning ``{"data": ..., "label": ...}`` dicts
    filled with the index."""

    def __init__(self, shape, length):
        super().__init__()
        self._shape = shape
        self._length = length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return {
            "data": torch.full(self._shape, index, dtype=torch.float32),
            "label": torch.full((1, ), index, dtype=torch.long)
        }


class CheckOrderModel(torch.nn.Module):
    def forward(self, data, expected):
        # Returns the summed element-wise difference: 0 if data was what we
        # expected.
        return torch.sum(data - expected)


class DoubleData(torch.nn.Module):
    def forward(self, data):
        return data * 2


class DoubleDataLabel(torch.nn.Module):
    def forward(self, data, label):
        return data * 2, label * 2


def _run_test(shape=None,
              num_tensors=100,
              batch_size=1,
              num_workers=0,
              device_iterations=1,
              replication_factor=1):
    """Check a synchronous DataLoader returns the tensors in order."""
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    assert len(data) == num_tensors // (device_iterations * batch_size *
                                        replication_factor)

    model = poptorch.inferenceModel(CheckOrderModel(), opts)
    for it, d in enumerate(data):
        expected = torch.from_numpy(
            numpy.stack([
                numpy.full(shape, i, dtype=numpy.float32)
                for i in range(data.combinedBatchSize * it,
                               data.combinedBatchSize * (it + 1))
            ]))
        diff = torch.sum(model(d, expected))
        numpy.testing.assert_array_equal(diff.numpy(), [0.])


def test_simple():
    _run_test()


def test_batch():
    _run_test(batch_size=4)


def test_workers():
    _run_test(num_workers=8)


def test_device_iterations():
    _run_test(device_iterations=4)


@pytest.mark.ipuHardwareRequired
def test_replica():
    _run_test(replication_factor=4)


@pytest.mark.ipuHardwareRequired
def test_combined():
    _run_test(batch_size=2,
              device_iterations=5,
              replication_factor=2,
              num_workers=4)


def _run_process_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      num_runs=1):
    """Check an Async DataLoader feeds the expected tensors to a model."""
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape, num_tensors),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleData(), opts)

    for _ in range(0, num_runs):
        for it, d in enumerate(loader):
            out = model(d)

            expected = torch.stack([
                torch.full(shape, i * 2, dtype=torch.float32)
                for i in range(loader.combinedBatchSize * it,
                               loader.combinedBatchSize * (it + 1))
            ])

            helpers.assert_allequal(actual=out, expected=expected)


def test_multithreaded1():
    _run_process_test(num_tensors=100,
                      batch_size=2,
                      device_iterations=1,
                      replication_factor=1,
                      num_workers=0)


def test_multithreaded2():
    _run_process_test(num_tensors=100,
                      batch_size=2,
                      device_iterations=10,
                      replication_factor=1,
                      num_workers=0)


@pytest.mark.ipuHardwareRequired
def test_multithreaded3():
    _run_process_test(num_tensors=10,
                      batch_size=2,
                      device_iterations=1,
                      replication_factor=4,
                      num_workers=0)


def _run_process_label_test(shape=None,
                            num_tensors=100,
                            batch_size=1,
                            num_workers=0,
                            device_iterations=1,
                            replication_factor=1):
    """Check an Async DataLoader handles ``(data, label)`` tuples."""
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    loader = poptorch.DataLoader(opts,
                                 IncrementDatasetWithLabels(
                                     shape, num_tensors),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleDataLabel(), opts)

    total = torch.zeros(shape)
    label_out = torch.zeros(1, dtype=torch.int)
    for (data, label) in loader:
        out, label = model(data, label)
        total += torch.sum(out, dim=0)
        label_out += torch.sum(label, dim=0)

    # The model doubles every value, and item i is filled with i.
    expected = sum(i * 2 for i in range(num_tensors))

    numpy.testing.assert_array_equal(total[0][0].numpy(), [expected])
    numpy.testing.assert_array_equal(label_out[0].item(), [expected])


def test_multithreaded4():
    _run_process_label_test(num_tensors=60,
                            batch_size=2,
                            device_iterations=10,
                            replication_factor=1,
                            num_workers=0)


def _run_subdataset_test(num_tensors=100,
                         batch_size=1,
                         num_workers=0,
                         device_iterations=1,
                         replication_factor=1,
                         num_hosts=1):
    """Check each process id of a distributed configuration gets a
    contiguous slice of the dataset."""
    shape = [2, 3]
    dataset = IncrementDataset(shape, num_tensors)

    combined_batch_size = 0
    next_expected = 0
    for host_id in range(num_hosts):
        opts = poptorch.Options()
        opts.deviceIterations(device_iterations)
        opts.replicationFactor(replication_factor)
        opts.Distributed.configureProcessId(host_id, num_hosts)

        loader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=batch_size,
                                     num_workers=num_workers,
                                     mode=poptorch.DataLoaderMode.Async)
        combined_batch_size = loader.combinedBatchSize
        assert combined_batch_size == (device_iterations * batch_size *
                                       replication_factor)
        assert len(loader) == num_tensors // (combined_batch_size * num_hosts)

        for d in loader:
            for elt in d:
                val = int(elt[0][0].item())
                assert val == next_expected
                next_expected += 1

    # Number of processes shouldn't change how many tensors are returned
    num_expected = num_hosts * combined_batch_size * (
        num_tensors // (combined_batch_size * num_hosts))
    assert next_expected == num_expected


def _run_shuffle_subdataset_test(num_tensors=100,
                                 batch_size=1,
                                 num_workers=0,
                                 device_iterations=1,
                                 replication_factor=1,
                                 num_hosts=1):
    """Check shuffled distributed loaders never repeat a tensor within an
    epoch and together cover the whole dataset over two epochs."""
    shape = [2, 3]
    dataset = IncrementDataset(shape, num_tensors)

    total = [False] * num_tensors
    for host_id in range(num_hosts):
        seen = [False] * num_tensors

        opts = poptorch.Options()
        opts.deviceIterations(device_iterations)
        opts.replicationFactor(replication_factor)
        opts.randomSeed(42)
        opts.Distributed.configureProcessId(host_id, num_hosts)

        loader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     num_workers=num_workers,
                                     mode=poptorch.DataLoaderMode.Async)
        combined_batch_size = loader.combinedBatchSize
        assert combined_batch_size == (device_iterations * batch_size *
                                       replication_factor)
        assert len(loader) == num_tensors // (combined_batch_size * num_hosts)

        for d in loader:
            for elt in d:
                val = int(elt[0][0].item())
                assert not seen[val]
                seen[val] = True
                total[val] = True

        assert seen.count(True) == combined_batch_size * (
            num_tensors // (combined_batch_size * num_hosts))

        # Iterate a second time to make sure the left over tensors get used too.
        for d in loader:
            for elt in d:
                val = int(elt[0][0].item())
                total[val] = True

    # If we iterate twice in all the processes then all the tensors should be used.
    assert total.count(True) == num_tensors


def test_subdataset():
    _run_subdataset_test(batch_size=4, num_hosts=3)


def test_subdataset2():
    _run_subdataset_test(batch_size=2, num_hosts=2, num_workers=3)


def test_shuffle_subdataset():
    _run_shuffle_subdataset_test(batch_size=4, num_hosts=3)


def test_shuffle_subdataset2():
    _run_shuffle_subdataset_test(batch_size=2, num_hosts=2, num_workers=3)


@pytest.mark.parametrize("num_processes", [2, 3, 4, 5])
@pytest.mark.parametrize("num_workers", [0, 1, 3])
def test_global_shuffle_each_epoch(num_processes, num_workers):
    each_process_data = []

    for process_id in range(num_processes):
        each_process_data.append([])

        opts = poptorch.Options()
        opts.randomSeed(0)
        opts.Distributed.configureProcessId(process_id, num_processes)

        dataloader = poptorch.DataLoader(
            opts,
            IncrementDataset((), 100),
            batch_size=16,
            shuffle=True,
            num_workers=num_workers,
        )

        for _ in range(5):
            each_epoch_data = []
            for batch in dataloader:
                each_epoch_data += batch.tolist()
            each_process_data[process_id].append(sorted(each_epoch_data))

    # Make sure data between epochs differs within the same process
    # for all processes.
    for process_data in each_process_data:
        for epoch_data_i, epoch_data_j in itertools.combinations(
                process_data, 2):
            assert epoch_data_i != epoch_data_j


def test_interrupt_async_loader():
    """Make sure the worker processes are stopped cleanly even when the
    end of the dataset is not reached."""

    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape, num_tensors),
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        break


def test_single_epoch():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape, num_tensors),
                                 batch_size=1,
                                 num_workers=32,
                                 mode=poptorch.DataLoaderMode.Async)

    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        continue


def test_iterable_dataset():
    shape = [2, 3]
    num_tensors = 100

    loader = poptorch.AsynchronousDataAccessor(
        IncrementIterableDataset(shape, num_tensors))

    for _, _ in enumerate(loader):
        continue

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue


def test_iterable_dataloader():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape, num_tensors),
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    for _, t in enumerate(loader):
        assert t.shape == torch.Size([1, 2, 3])
        continue

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("persistent_workers", [True, False])
def test_iterable_dataloader_reset(persistent_workers):
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape, num_tensors),
                                 persistent_workers=persistent_workers,
                                 batch_size=1,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    # Interrupt the first iteration
    for i, t in enumerate(loader):
        assert t.shape == torch.Size([1, 2, 3])
        assert t[0][0][0] == i
        if i == 4:
            print(f"Last tensor first iteration {t}")
            break
        continue

    print("Second iterator")
    # Make sure the second iteration returns all the tensors
    for i, t in enumerate(loader):
        assert t[0][0][0] == i
    assert i == (num_tensors - 1)


def test_early_preload():
    shape = [2, 3]
    num_tensors = 10
    num_buffers = 5

    opts = poptorch.Options()
    data = IncrementDataset(shape, num_tensors)
    async_opts_preload = {'early_preload': True, 'buffer_size': num_buffers}
    async_opts_no_preload = {
        'early_preload': False,
        'buffer_size': num_buffers
    }
    dataloader_args = {
        'options': opts,
        'dataset': data,
        'batch_size': 1,
        'num_workers': 1
    }

    preload = poptorch.DataLoader(**dataloader_args,
                                  mode=poptorch.DataLoaderMode.Async,
                                  async_options=async_opts_preload)
    no_preload = poptorch.DataLoader(**dataloader_args,
                                     mode=poptorch.DataLoaderMode.Async,
                                     async_options=async_opts_no_preload)

    time.sleep(2)  # Give time for the worker to fill the buffer

    assert sum(no_preload._accessor._worker._data_buffers.indices_mem) == 1  # pylint: disable=protected-access, no-member
    assert sum(
        preload._accessor._worker._data_buffers.indices_mem) == num_buffers  # pylint: disable=protected-access, no-member


def test_batch_size_None():
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape, num_tensors),
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    for _, t in enumerate(loader):
        assert t.shape == torch.Size([2, 3])
        continue

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue


def test_iterable_dataset_len():
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape, num_tensors),
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    with pytest.raises(TypeError,
                       match="'IncrementIterableDataset' has no len()"):
        len(loader)

    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDatasetWithLen(
                                     shape, num_tensors),
                                 batch_size=None,
                                 drop_last=False,
                                 num_workers=1,
                                 mode=poptorch.DataLoaderMode.Async)

    len(loader)


def test_broken_dataset():
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               BrokenDataset(num_tensors),
                               batch_size=1,
                               num_workers=32)

    with pytest.raises(poptorch.Error, match="worker thread failed to start"):
        poptorch.AsynchronousDataAccessor(data)


def test_subprocess_async_loader():
    print(subprocess.check_output(
        ["python3", "-m", "pytest", __file__, "-k", "test_single_epoch"],
        stderr=subprocess.STDOUT).decode('utf-8'),
          flush=True)


def test_subprocess_broken_dataset():
    stdout = subprocess.check_output([
        "python3", "-m", "pytest", __file__, "-k", "test_broken_dataset", "-s"
    ],
                                     stderr=subprocess.STDOUT).decode('utf-8')
    print(stdout)
    assert "AssertionError: Broken dataset" in stdout, (
        "Couldn't find failure "
        "reason in stdout")


@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDataset])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_reuse_workers(DatasetType, dtype):
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 DatasetType(shape, num_tensors, dtype=dtype),
                                 batch_size=1,
                                 num_workers=2,
                                 mode=poptorch.DataLoaderMode.Async)
    loader_no_reuse = poptorch.DataLoader(opts,
                                          DatasetType(shape,
                                                      num_tensors,
                                                      dtype=dtype),
                                          batch_size=1,
                                          persistent_workers=False,
                                          num_workers=2,
                                          mode=poptorch.DataLoaderMode.Async)

    # Workers are created when the AsynchronousDataAccessor is instantiated
    # So the first iteration should be fast
    num_tensors = 0
    start = time.perf_counter()
    for _ in loader_no_reuse:
        num_tensors += 1
    end = time.perf_counter()
    print(f"First epoch no reuse: {end - start} {num_tensors}")

    # subsequent iterations will join and create new workers
    # when a new iterator is created.
    # NOTE(review): the nesting of the timing prints was reconstructed from a
    # whitespace-collapsed copy of this file; it only affects the log output.
    for _ in range(3):
        start = time.perf_counter()
        for _ in loader_no_reuse:
            num_tensors += 1
        end = time.perf_counter()
        print(f"Other epoch no reuse: {end - start} {num_tensors}")

    start = time.perf_counter()
    num_tensors_reuse = 0
    for _ in loader:
        num_tensors_reuse += 1
    end = time.perf_counter()
    print(f"First epoch: {end - start} {num_tensors_reuse}")

    for _ in range(3):
        start = time.perf_counter()
        for _ in loader:
            num_tensors_reuse += 1
        end = time.perf_counter()
        print(f"Other epoch: {end - start} {num_tensors_reuse}")

    assert num_tensors_reuse == num_tensors


# Select a subset of the dataset for each worker
def _worker_init_fn(worker_id):
    worker_info = torch.utils.data.get_worker_info()
    dataset = worker_info.dataset
    total_len = dataset.length
    per_worker = math.ceil(dataset.length / worker_info.num_workers)
    dataset.start = per_worker * worker_id
    if worker_id == worker_info.num_workers - 1:
        # The last worker takes whatever is left after the others each took
        # per_worker elements.
        dataset.length = total_len - (per_worker *
                                      (worker_info.num_workers - 1))
    else:
        dataset.length = per_worker


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_iterable_dataloader_drop_last(mode, dtype):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4

    if mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 101 - 6 * 15 = 11 tensors
        # batch size = 4
        # Total = 6 * floor(15 / 4) + floor(11/4)
        #       = 6 * 3 + 2 = 20
        # Unused tensors = 101 - num_expected * 4 = 21
        num_expected = 20 * batch_size
    else:
        # Best case expected: floor(101/4) = 25 -> unused = 1
        num_expected = math.floor(num_tensors / batch_size) * batch_size

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementIterableDataset(shape,
                                                          num_tensors,
                                                          dtype=dtype),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode,
                                 drop_last=True,
                                 worker_init_fn=_worker_init_fn)

    values = set()
    for t in loader:
        assert t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected
    print("Missing tensors:")
    for i in range(num_tensors):
        if i not in values:
            print(i)

    # Make sure it works for more than 1 epoch
    values = set()
    for t in loader:
        assert t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_indexable_dataloader_drop_last(mode, dtype):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4

    # Expected tensors
    # Best case expected: floor(101/4) = 25 -> unused = 1
    num_expected = 100

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 IncrementDataset(shape,
                                                  num_tensors,
                                                  dtype=dtype),
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode)

    values = set()
    for t in loader:
        assert t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected
    print("Missing tensors:")
    for i in range(num_tensors):
        if i not in values:
            print(i)

    # Make sure it works for more than 1 epoch
    values = set()
    for t in loader:
        assert t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_indexable_dataloader_len(mode, dtype):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4

    ds = IncrementDataset(shape, num_tensors, dtype=dtype)
    assert len(ds) == num_tensors
    n = 0
    for n, _ in enumerate(ds):
        pass
    assert n + 1 == num_tensors

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=mode)

    if mode == poptorch.DataLoaderMode.Sync:
        # Make sure the user can still manually create the
        # data accessor. (This can only be tested in Sync
        # mode as otherwise the loader already contains
        # a data accessor).
        accessor = poptorch.AsynchronousDataAccessor(loader)
        assert len(loader) == num_tensors // batch_size
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_tensors // batch_size

    # NOTE(review): nesting reconstructed from a whitespace-collapsed copy of
    # this file; confirm this dataset-level check is meant to run for every
    # mode rather than only inside the Sync branch above.
    accessor = poptorch.AsynchronousDataAccessor(ds)
    assert len(accessor) == num_tensors
    for n, _ in enumerate(accessor):
        pass
    assert n + 1 == num_tensors

    assert len(loader) == num_tensors // batch_size
    for n, _ in enumerate(loader):
        pass
    assert n + 1 == num_tensors // batch_size


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
def test_dictionary_dataset(mode):
    shape = [2, 3]
    num_tensors = 500

    opts = poptorch.Options()
    opts.deviceIterations(2)
    opts.replicationFactor(3)

    loader = poptorch.DataLoader(opts,
                                 IncrementDatasetWithLabelsDict(
                                     shape, num_tensors),
                                 num_workers=3,
                                 mode=mode)

    shape_with_batch = [loader.combinedBatchSize] + shape

    it = 0
    for d in loader:
        assert isinstance(d, dict)
        assert len(d) == 2
        assert "data" in d
        assert "label" in d
        assert d["data"].shape == torch.Size(shape_with_batch)
        assert d["label"].shape == torch.Size([loader.combinedBatchSize, 1])
        it += 1

    assert it == num_tensors // loader.combinedBatchSize


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize("mode", [
    poptorch.DataLoaderMode.Async, poptorch.DataLoaderMode.AsyncRebatched,
    poptorch.DataLoaderMode.Sync
])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_iterable_dataloader_len(mode, dtype):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4

    # Note: Upstream torch returns the theoretical length
    # it doesn't take into account the items lost per worker.
    expected_len = math.floor(num_tensors / batch_size)

    if mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 101 - 6 * 15 = 11 tensors
        # batch size = 4
        # Total = 6 * floor(15 / 4) + floor(11/4)
        #       = 6 * 3 + 2 = 20
        # Unused tensors = 101 - num_iterations_expected * 4 = 21
        num_iterations_expected = 20
    else:
        # Best case expected: floor(101/4) = 25 -> unused = 1
        num_iterations_expected = expected_len

    ds = IncrementIterableDatasetWithLen(shape, num_tensors, dtype=dtype)
    assert len(ds) == num_tensors
    n = 0
    for n, _ in enumerate(ds):
        pass
    assert n + 1 == num_tensors

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 worker_init_fn=_worker_init_fn,
                                 mode=mode)

    if mode == poptorch.DataLoaderMode.Sync:
        accessor = poptorch.AsynchronousDataAccessor(loader)
        assert len(loader) == expected_len
        for n, _ in enumerate(accessor):
            pass
        assert n + 1 == num_iterations_expected

    # NOTE(review): nesting reconstructed from a whitespace-collapsed copy of
    # this file; confirm this dataset-level check is meant to run for every
    # mode rather than only inside the Sync branch above.
    accessor = poptorch.AsynchronousDataAccessor(ds)
    assert len(accessor) == num_tensors
    for n, _ in enumerate(accessor):
        pass
    assert n + 1 == num_tensors

    assert len(loader) == expected_len
    for n, _ in enumerate(loader):
        pass
    assert n + 1 == num_iterations_expected


# A list (not a set) keeps the parametrization order, and therefore the test
# ids, deterministic.
@pytest.mark.parametrize(
    "mode",
    [poptorch.DataLoaderMode.AsyncRebatched, poptorch.DataLoaderMode.Sync])
@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDatasetWithLen])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
def test_leftover(mode, DatasetType, dtype):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 6

    # Note: Upstream torch returns the theoretical length
    # it doesn't take into account the items lost per worker.
    expected_len = math.ceil(num_tensors / batch_size)

    ds = DatasetType(shape, num_tensors, dtype=dtype)

    if isinstance(ds, torch.utils.data.IterableDataset
                  ) and mode != poptorch.DataLoaderMode.AsyncRebatched:
        # Expected tensors
        # tensors per worker = ceil(101/7) = 15
        # last worker = 11 tensor
        # batch size = 6
        # Total = 6 * floor(15 / 6) + floor(11/6)
        #       = 6 * 2 + 1 = 13
        # Left over per worker: 3, 5 for the last one
        num_full_iterations_expected = 13
        left_over_batches = [5] + [3] * 6
    else:
        # Best case expected: floor(101/6) = 16 -> unused = 5
        num_full_iterations_expected = 16
        left_over_batches = [5]

    assert len(ds) == num_tensors
    n = 0
    for n, d in enumerate(ds):
        assert d.shape == torch.Size(shape)
    assert n + 1 == num_tensors

    opts = poptorch.Options()
    worker_init_fn = None
    if isinstance(ds, torch.utils.data.IterableDataset):
        worker_init_fn = _worker_init_fn
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 worker_init_fn=worker_init_fn,
                                 drop_last=False,
                                 mode=mode)

    assert len(loader) == expected_len

    for _ in range(2):
        # There is no guarantee about the order in which
        # the full vs partial batches will be returned
        # so we need to keep track of which ones we've seen so far
        # and assert at the end.
        full_iterations_left = num_full_iterations_expected
        left_overs_left = left_over_batches.copy()
        for n, d in enumerate(loader):
            print("Dequeued tensor shape ", d.shape)
            if d.shape[0] == batch_size:
                full_iterations_left -= 1
            else:
                assert d.shape[0] in left_overs_left
                left_overs_left.remove(d.shape[0])

        num_iterations_expected = num_full_iterations_expected + len(
            left_over_batches)
        assert full_iterations_left == 0
        assert not left_overs_left
        assert n + 1 == num_iterations_expected


@pytest.mark.parametrize("DatasetType",
                         [IncrementDataset, IncrementIterableDatasetWithLen])
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
@pytest.mark.parametrize("drop_last", [True, False])
@pytest.mark.parametrize("rebatched_worker_size", [1, 2, 3, 4, None])
def test_rebatched_worker_size(DatasetType, dtype, drop_last,
                               rebatched_worker_size):
    shape = [2, 3]
    num_tensors = 101
    num_workers = 7
    batch_size = 4

    ds = DatasetType(shape, num_tensors, dtype=dtype)
    worker_init_fn = None
    if isinstance(ds, torch.utils.data.IterableDataset):
        worker_init_fn = _worker_init_fn

    if drop_last:
        # Best case expected: floor(101/4) = 25 -> unused = 1
        num_expected = math.floor(num_tensors / batch_size) * batch_size
    else:
        num_expected = num_tensors

    opts = poptorch.Options()
    loader = poptorch.DataLoader(opts,
                                 ds,
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 mode=poptorch.DataLoaderMode.AsyncRebatched,
                                 drop_last=drop_last,
                                 rebatched_worker_size=rebatched_worker_size,
                                 worker_init_fn=worker_init_fn)

    values = set()
    for t in loader:
        assert not drop_last or t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected
    print("Missing tensors:")
    for i in range(num_tensors):
        if i not in values:
            print(i)

    # Make sure it works for more than 1 epoch
    values = set()
    for t in loader:
        assert not drop_last or t.shape == torch.Size([4, 2, 3])
        for b in t:
            v = int(b[0][0])
            assert v not in values
            values.add(v)

    assert len(values) == num_expected
def process_to_kill_asyncdataloader(iterate_over_data: bool):
    """Build an asynchronous dataloader, then wait to be interrupted.

    Companion of ``test_KeyboardInterrupt_in_async_data_accessor``: the
    function is meant to run as a standalone script that gets killed.
    Everything it needs is imported locally so that the body does not
    depend on the globals of the module that defines it.

    :param iterate_over_data: when true, iterate once over the dataloader
        before waiting for the interrupt.
    """
    # pylint: disable=import-outside-toplevel
    # pylint: disable=reimported
    import time
    import poptorch
    import torch

    num_samples = 100

    options = poptorch.Options()
    options.deviceIterations(2)
    options.replicationFactor(1)

    inputs = torch.randn([num_samples, 1, 128, 128])
    targets = torch.empty([num_samples], dtype=torch.long).random_(10)

    training_data = poptorch.DataLoader(
        options,
        dataset=torch.utils.data.TensorDataset(inputs, targets),
        batch_size=16,
        shuffle=True,
        drop_last=True,
        num_workers=2,
        mode=poptorch.DataLoaderMode.Async,
    )

    if iterate_over_data:
        # Walking through the data (even without using it) changes the
        # state of the accessor.
        for _, _ in training_data:
            pass
        # Cool down after the iteration: without it the accessor may still
        # be in an unsafe state. This mimics interactive environments.
        time.sleep(1)

    print("[control] Dataloader prepared, waiting for sigint.")

    # The parent is expected to interrupt this process within 30 seconds.
    try:
        time.sleep(30)
        raise RuntimeError(
            "We should not reach this point, we should receive SIGINT before")
    except KeyboardInterrupt:
        print("[control] KeyboardInterrupt received in parent exiting.")
class DynamicBatchSampler(torch.utils.data.Sampler):
    """Batch sampler whose batch sizes cycle through 1, 2, ..., batch_size.

    Indices are drawn from ``sampler`` in order and grouped into batches of
    size 1, then 2, ... up to ``batch_size``, after which the cycle restarts
    at 1. The final batch may be shorter than its target size if the
    sampler runs out of indices.
    """

    def __init__(self, sampler, batch_size):
        super().__init__(None)
        self.sampler = sampler
        self.batch_size = batch_size

    def __iter__(self):
        """Yield lists of indices following the 1..batch_size size cycle."""
        indices = []
        target_size = 1
        for sample in self.sampler:
            indices.append(sample)
            if len(indices) == target_size:
                yield indices
                indices = []
                # batch_size -> 1, anything else -> next size up.
                target_size = target_size % self.batch_size + 1
        if indices:
            # Left-over indices form a final, incomplete batch.
            yield indices

    def __len__(self):
        """Return the number of batches ``__iter__`` will yield.

        The value is recomputed on every call: caching it with
        ``functools.lru_cache`` on the method would key the cache on
        ``self``, keeping every sampler alive and returning a stale length
        if ``sampler`` or ``batch_size`` is reassigned.
        """
        # One full cycle consumes 1 + 2 + ... + batch_size indices.
        cycle_len = self.batch_size * (self.batch_size + 1) // 2
        full_cycles, remainder = divmod(len(self.sampler), cycle_len)
        num_batches = full_cycles * self.batch_size

        # Replay the start of a cycle over the remaining indices; the last
        # batch counted may be incomplete.
        size = 1
        while remainder > 0:
            remainder -= size
            size += 1
            num_batches += 1
        return num_batches
@pytest.mark.parametrize("batch_size", [2, 3])
@pytest.mark.parametrize("device_iteration", [1, 4, 5])
@pytest.mark.parametrize("drop_last", [True, False])
@pytest.mark.parametrize("num_workers", [1, 10])
def test_batch_sampler_basic(batch_size, device_iteration, drop_last,
                             num_workers):
    """Check a ``poptorch.DataLoader`` driven by a plain ``BatchSampler``.

    Each batch returned by the loader is expected to hold
    ``batch_size * device_iteration`` consecutive elements of the dataset;
    unless ``drop_last`` is set, a shorter final batch holds the rest.
    """
    shape = [2, 3]
    dtype = torch.float32
    dataset_size = 100
    combined_batch_size = batch_size * device_iteration

    num_full_batches, tail_size = divmod(dataset_size, combined_batch_size)
    has_tail = tail_size != 0 and not drop_last
    expected_num_batches = num_full_batches + (1 if has_tail else 0)

    dataset = IncrementDataset(shape, dataset_size, dtype)
    batch_sampler = torch.utils.data.BatchSampler(
        torch.utils.data.SequentialSampler(dataset),
        batch_size=batch_size,
        drop_last=drop_last)

    opts = poptorch.Options().deviceIterations(device_iteration)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_sampler=batch_sampler,
                                 drop_last=drop_last,
                                 num_workers=num_workers)

    def expected_batch(batch_id, expected_batch_size):
        # Element k of the batch is a tensor filled with the global index
        # of that element.
        first_value = batch_id * combined_batch_size
        rows = [
            torch.full(shape, first_value + offset, dtype=dtype)
            for offset in range(expected_batch_size)
        ]
        return torch.stack(rows)

    batches = list(loader)
    assert len(batches) == expected_num_batches

    # All batches but the (optional) incomplete tail must be full.
    num_complete = len(batches) - 1 if has_tail else len(batches)
    for batch_id in range(num_complete):
        assert torch.equal(batches[batch_id],
                           expected_batch(batch_id, combined_batch_size))

    if has_tail:
        assert torch.equal(batches[-1],
                           expected_batch(num_complete, tail_size))
@pytest.mark.parametrize("batch_size", [1, 4])
@pytest.mark.parametrize("drop_last", [True, False])
def test_default_batch_sampler(batch_size, drop_last):
    """Check batch count and shapes when no batch sampler is supplied.

    Uses an ``Async`` dataloader with a padding collate function and a
    dataset sized to hold a whole number of combined batches, plus
    ``incomplete_batches`` extra batches.
    """
    device_iteration = 1
    num_workers = 4
    return_type = tuple
    mode = poptorch.DataLoaderMode.Async
    shape = [3, 1]

    # pseudo random value for number of expected batches produced by
    # dataloader
    expected_num_batches = batch_size + device_iteration + num_workers
    # NOTE: with device_iteration == 1 this is always 0, so the
    # "incomplete tail" checks below are currently never exercised.
    incomplete_batches = 3 % device_iteration
    combined_batch_size = batch_size * device_iteration

    dataset_size = (expected_num_batches * combined_batch_size +
                    incomplete_batches * batch_size)

    last_incomplete = not drop_last and incomplete_batches != 0
    if last_incomplete:
        expected_num_batches += 1

    collate_fn = DynamicPadCollateFunction(batch_size, return_type)
    opts = poptorch.Options().deviceIterations(device_iteration)
    dataset = IncrementDatasetWithLabels(shape, dataset_size)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=batch_size,
                                 collate_fn=collate_fn,
                                 num_workers=num_workers,
                                 drop_last=drop_last,
                                 mode=mode)

    batches = list(loader)
    assert len(batches) == expected_num_batches

    def assert_batch_shapes(batch, leading_dim):
        # Data keeps the per-sample shape, labels are one value per sample.
        assert get_item(batch, "data", return_type).shape == \
            torch.Size([leading_dim] + shape)
        assert get_item(batch, "label", return_type).shape == \
            torch.Size([leading_dim, 1])

    num_complete = len(batches) - 1 if last_incomplete else len(batches)
    for batch in batches[:num_complete]:
        assert_batch_shapes(batch, combined_batch_size)

    if last_incomplete:
        assert_batch_shapes(batches[-1], incomplete_batches * batch_size)
class Model(torch.nn.Module):
    """Linear -> ReLU -> Linear network over 10 features.

    In training mode ``forward`` returns the mean-squared-error loss
    against ``labels``; in evaluation mode it returns the raw output of the
    second linear layer.
    """

    def __init__(self):
        super().__init__()
        num_features = 10
        self.fc1 = torch.nn.Linear(num_features, num_features)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(num_features, num_features)
        self.loss = torch.nn.MSELoss(reduction="mean")

    def forward(self, x, labels=None):
        hidden = self.relu(self.fc1(x))
        prediction = self.fc2(hidden)
        if not self.training:
            return prediction
        return self.loss(prediction, labels)
poptorch_model = poptorch.load(filename) tensors = poptorch_model.getTensorNames() assert any(re.search(r"\bfc1\b", t) for t in tensors) assert any(re.search(r"\bfc2\b", t) for t in tensors) assert any(t.startswith('input') for t in tensors) assert any(t.startswith('loss') for t in tensors) assert any(t.startswith('weightDecayScaleFactor') for t in tensors) assert any(t.startswith('scaledLearningRate') for t in tensors) def test_tensor_values(): model = Model() opts = poptorch.Options() opts.anchorTensor('grad_bias', 'Gradient___fc2.bias') opts.anchorTensor('update_weight', 'UpdatedVar___fc2.weight') poptorch_model = poptorch.trainingModel(model, opts) input = torch.rand(10, 10) label = torch.rand(10, 10) poptorch_model(input, label) grad1 = poptorch_model.getAnchoredTensor('grad_bias') assert grad1.shape == (10, ) update1 = poptorch_model.getAnchoredTensor('update_weight') assert update1.shape == (10, 10) input = torch.rand(10, 10) label = torch.rand(10, 10) poptorch_model(input, label) grad2 = poptorch_model.getAnchoredTensor('grad_bias') assert grad2.shape == (10, ) update2 = poptorch_model.getAnchoredTensor('update_weight') assert update2.shape == (10, 10) assert not torch.equal(grad1, grad2) assert not torch.equal(update1, update2) output_modes = [[poptorch.OutputMode.All, 3, "ALL/1"], [poptorch.OutputMode.EveryN, 4, "EVERYN/4"], [poptorch.OutputMode.Final, 1, "FINAL/1"], [poptorch.OutputMode.Sum, 1, "Sum/1"]] @pytest.mark.parametrize("mode, period, expected_str", output_modes) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_tensor_modes(capfd, mode, period, expected_str): model = Model() tensor_name = 'Gradient___fc2.bias' opts = poptorch.Options() opts.anchorTensor('grad_bias', tensor_name, mode, period) poptorch_model = poptorch.trainingModel(model, opts) input = torch.rand(10, 10) label = torch.rand(10, 10) poptorch_model(input, label) testlog = helpers.LogChecker(capfd) testlog.assert_contains(tensor_name + ' ' + 
expected_str) ================================================ FILE: tests/distance_ops_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import torch import pytest import helpers import poptorch @pytest.mark.parametrize("norm", {1., 2., 3., 4.}) def test_pairwise_distance(norm): torch.manual_seed(42) size = [10, 5] input1 = torch.randn(size) input2 = torch.randn(size) shape = input1.shape model = helpers.ModelWithWeights(torch.nn.PairwiseDistance(norm), shape) poptorch_model = poptorch.trainingModel(model) # Run on CPU native_out, _ = model((input1, input2)) # Run on IPU poptorch_out, _ = poptorch_model((input1, input2)) # Inference test - check outputs helpers.assert_allclose(expected=native_out, actual=poptorch_out) # Training test - check weights changed poptorch_model.assert_weights_changed() @pytest.mark.parametrize("dim", {0, 1}) def test_cosine_similarity(dim): torch.manual_seed(42) size = [10, 5] input1 = torch.randn(size) input2 = torch.randn(size) shape = input1.shape model = helpers.ModelWithWeights(torch.nn.CosineSimilarity(dim), shape) poptorch_model = poptorch.trainingModel(model) # Run on CPU native_out, _ = model((input1, input2)) # Run on IPU poptorch_out, _ = poptorch_model((input1, input2)) # Inference test - check outputs helpers.assert_allclose(expected=native_out, actual=poptorch_out) # Training test - check weights changed poptorch_model.assert_weights_changed() @pytest.mark.parametrize("input_shapes", (((3, 2), (2, 2)), ((3, 2, 3), (3, 10, 3)), ((3, 5, 2, 7), (5, 11, 7)), ((3, 5, 1, 2, 7), (3, 1, 10, 11, 7)))) @pytest.mark.parametrize("p", (2, 3)) def test_cdist(input_shapes, p): a_shape, b_shape = input_shapes torch.manual_seed(42) class Cdist(torch.nn.Module): def __init__(self, p, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.p = p def forward(self, x, y): return torch.cdist(x, y, self.p) a = torch.rand(*a_shape) b = 
def harness(setting, Model, args):
    """Run ``Model`` on the IPU with a given floating point exception mode.

    :param setting: ``"true"`` / ``"false"`` explicitly enable / disable
        floating point exceptions; any other value (``"default"``) leaves
        the option untouched.
    :param Model: module class, instantiated without arguments.
    :param args: positional arguments passed to the compiled model.

    When exceptions are explicitly enabled the call must raise
    ``poptorch.Error``; otherwise it must complete normally.
    """
    opts = poptorch.Options()
    if setting == "true":
        opts.Precision.enableFloatingPointExceptions(True)
    elif setting == "false":
        opts.Precision.enableFloatingPointExceptions(False)

    poptorch_model = poptorch.inferenceModel(Model(), opts)

    if setting == "true":
        with pytest.raises(poptorch.Error):
            poptorch_model(*args)
    else:
        poptorch_model(*args)


# NOTE: the parameter values are given as tuples rather than sets: a set of
# strings iterates in an order that depends on hash randomisation, so the
# collection order would differ from one Python process to the next.
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_div0(setting):
    """Division by zero."""

    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x / y

    x = torch.ones(10, 10)
    y = torch.zeros(10, 10)

    harness(setting, Model, [x, y])


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_mul0inf(setting):
    """Zero multiplied by infinity."""

    class Model(torch.nn.Module):
        def forward(self, x, y):
            return x * y

    x = torch.zeros(10, 10)
    y = torch.div(torch.ones(10, 10), torch.zeros(10, 10))

    harness(setting, Model, [x, y])


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("setting", ("default", "true", "false"))
def test_nonreal(setting):
    """Square root of negative numbers."""

    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.sqrt(x)

    x = torch.Tensor([-1, -2])

    harness(setting, Model, [x])
def fine_tuning_harness(imagenet_model):
    """Fine tune only the classifier of a torchvision model on the IPU.

    :param imagenet_model: model constructor accepting ``pretrained``; the
        model it returns must expose its classifier as ``fc``.

    All parameters are frozen, the ``fc`` layer is replaced by a fresh
    two-class linear layer, the model is trained for a few steps and the
    test then checks that the ``fc`` parameters changed while every other
    parameter kept its initial value.
    """
    torch.manual_seed(42)

    num_classes = 2
    num_examples = 2
    num_epochs = 20

    data = torch.randn((num_examples, 3, 224, 224))
    target = torch.randint(0, num_classes, (num_examples, ))

    base_model = imagenet_model(pretrained=False)
    loss_fn = torch.nn.CrossEntropyLoss()

    class ModelWithLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.base_model = base_model

        def forward(self, data, target):
            # ``base_model`` is the very object registered as
            # ``self.base_model`` above.
            out = base_model(data)
            return out, loss_fn(out, target)

    model = ModelWithLoss()
    backbone = model.base_model

    # Freeze everything...
    for param in backbone.parameters():
        param.requires_grad = False

    # ...then change the linear classifier at the top: the new layer is
    # trainable by default.
    backbone.fc = torch.nn.Linear(backbone.fc.in_features, num_classes)
    for param in backbone.fc.parameters():
        assert param.requires_grad

    initial_params = copy.deepcopy(model).state_dict()

    # Fine tune.
    optim = torch.optim.SGD(backbone.fc.parameters(), lr=0.001)
    poptorch_model = poptorch.trainingModel(model, optimizer=optim)
    for _ in range(num_epochs):
        _ = poptorch_model(data, target)

    # Assert only the last layer was changed.
    for name, param in model.named_parameters():
        before = initial_params[name]
        if name.startswith('base_model.fc'):
            assert not torch.allclose(param.data, before)
        else:
            helpers.assert_allclose(actual=param.data, expected=before)
poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) assert poptorch_out.dtype == torch.half helpers.assert_allequal(actual=poptorch_out, expected=nativeOut) @pytest.mark.parametrize("in_features,out_features", [(8, 7), (7, 6), (6, 5)]) def test_linear(in_features, out_features): class Model(torch.nn.Module): weight: torch.Tensor bias: torch.Tensor def __init__(self, in_features: int, out_features: int): super().__init__() self.weight = torch.nn.parameter.Parameter( torch.ones((out_features, in_features), dtype=torch.float)) self.bias = torch.nn.parameter.Parameter(torch.ones(out_features)) def forward(self, x): return torch.nn.functional.linear(x, self.weight, self.bias) input = torch.arange(out_features * in_features, dtype=torch.float).reshape(out_features, in_features) model = Model(in_features=in_features, out_features=out_features) # Run on CPU. native_out = model(input) # Run on IPU. poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) assert poptorch_out.dtype == torch.float helpers.assert_allclose(actual=poptorch_out, expected=native_out) ================================================ FILE: tests/generate_test_file.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import argparse import contextlib import os import io import re import sys import pytest parser = argparse.ArgumentParser(description="Generate CTestTestfile.cmake") parser.add_argument("test_dir", help="Path to the folder containing the tests") parser.add_argument("output_file", help="Path to CTestTestfile.cmake") parser.add_argument("--add-to-sys-path", help="Path to add to sys.path") parser.add_argument("--external-datasets-dir", type=str, help=("The directory where the external datasets will be " "downloaded.")) parser.add_argument("--extra-pytest-args", type=str, help=("Extra arguments to pass to pytest when generating " "the list of tests.")) args = parser.parse_args() if args.add_to_sys_path: for path in args.add_to_sys_path.split(";"): print(f"Adding {path}") sys.path.insert(0, path) # This script doesn't actually need poptorch, but pytest later on will import # it while compiling the list of tests and if it fails then we usually don't # get the reason (Because the collection happens in a subprocess). 
import poptorch # pylint: disable=unused-import,wrong-import-position # Collect the list of tests: list_tests = io.StringIO() pytest_args = [ "-x", args.test_dir, "--collect-only", "-q", ] extra_args = [] if args.extra_pytest_args: arg = args.extra_pytest_args.replace("\"", "") if arg: extra_args = arg.split(",") pytest_args += extra_args with contextlib.redirect_stdout(list_tests): retval = pytest.main(pytest_args) assert retval == pytest.ExitCode.OK, f"{str(retval)}: {list_tests.getvalue()}" # Run all the tests contained in these files in a single process # because they're small / short to run (Under 1 minute) # NB tests requring custom_ops libraries must go in here #pylint: disable=line-too-long # yapf: disable short_tests = [ "activations_test.py", "batching_test.py", "blas_test.py", "buffers_test.py", "custom_loss_test.py", "custom_ops_attributes_test.py", "custom_ops_test.py", "inputs_test.py", "loop_test.py", "if_test.py", "lstm_test.py", "non_contiguous_tensors_test.py", "ops_test.py", "options_test.py", "outputs_test.py", "pipelining_test.py", "poplar_executor_test.py", "precompilation_test.py", "random_sampling_test.py", "replicated_graph_test.py", "requires_grad_test.py", "sharding_test.py", "gnn/test_cluster_loader.py", "gnn/test_collate.py", "gnn/test_dataloader.py", "gnn/test_fixed_size_options.py", "gnn/test_masker.py", "gnn/test_model_args.py", "gnn/test_stream_packing_sampler.py", ] # The only tests that should be run in doc-only builds. 
# The only tests that should be run in doc-only builds.
docs_only_test_files = [
    "test_doc_urls.py",
]

# Tests labelled "long". Every entry must be followed by a comma: adjacent
# string literals are silently concatenated into a single (never matching)
# test name.
long_tests = [
    "bert_small_and_medium_test.py::test_bert_medium_result",
    "half_test.py::test_resnet",
    "math_ops_test.py::test_sort[descending:True-shape1]",
    "math_ops_test.py::test_sort[descending:False-shape1]",
    "io_performance_test.py::test_compare_io_performance",
    "torch_nn_test.py::test_pytorch_nn[trace_model:False-use_half:False-test_name:test_nn_Conv2d_circular_stride2_pad2]",
    "torch_nn_test.py::test_pytorch_nn[trace_model:True-use_half:False-test_name:test_nn_Conv2d_circular_stride2_pad2]",
    "torchvision_inference_test.py::test_googlenet",
    "torchvision_inference_test.py::test_inception_v3",
    "torchvision_inference_test.py::test_mnasnet1_0",
    "torchvision_inference_test.py::test_mobilenet_v2",
    "torchvision_inference_test.py::test_resnet18",
    "torchvision_inference_test.py::test_resnext50_32x4d",
    "torchvision_inference_test.py::test_squeezenet1_1",
]

# Tests depending on external data being downloaded to run.
external_data_tests = [
    "bert_small_and_medium_test.py::test_bert_medium_result",
    "bert_small_and_medium_test.py::test_bert_small",
    "bert_small_and_medium_test.py::test_bert_small_half",
    "gnn/test_schnet.py",
]
def _write_test_properties(output, test, test_properties):
    """Write the ``set_tests_properties`` entry of ``test`` to ``output``."""
    props_string = " ".join(f"{k} {v}" for k, v in test_properties.items())
    output.write(f"set_tests_properties({test} PROPERTIES\n{props_string})\n")


def add_pytest(output, test, root_folder, folder, test_id, test_properties,
               extra_args):
    """Write the CTest entry running one pytest target.

    :param output: writable text stream.
    :param test: test name, also used as the pytest target inside
        ``folder``.
    :param root_folder: folder containing ``timeout_handler.py``.
    :param folder: folder containing the test file.
    :param test_id: number used to name the junit XML report.
    :param test_properties: mapping written as CTest properties.
    :param extra_args: extra arguments forwarded (quoted) to pytest.

    The external datasets directory is read from the module level ``args``.
    """
    command = [
        f'"{root_folder}/timeout_handler.py"',
        '"python3"',
        '"-m"',
        '"pytest"',
        '"-sv"',
        f'"{folder}/{test}"',
        f'"--external-datasets-dir={args.external_datasets_dir}"',
        f'"--junitxml=junit/junit-test{test_id}.xml"',
        # Kept even when empty so the separator before ")" is preserved.
        " ".join(f'"{arg}"' for arg in extra_args),
    ]
    output.write(f"add_test({test} {' '.join(command)})\n")
    _write_test_properties(output, test, test_properties)


def add_cpp_test(output, test, root_folder, folder, test_properties):
    """Write the CTest entry running the C++ test binary ``folder/test``."""
    output.write(f'add_test({test} "{root_folder}/timeout_handler.py" '
                 f'"{folder}/{test}" )\n')
    _write_test_properties(output, test, test_properties)
path = os.path.normpath(m.group(1)) # Seperate out the dirs and remove the "tests" from the start # and the test name from the end. separate_dirs = path.split(os.sep)[1:-1] # Append the dirs to the start of the root dir one. dir_path = os.path.join(dir_path, *separate_dirs) if test_file in short_tests: continue test_name = f"{test_file}::{m.group(2)}" labels = [] if test_name in long_tests: labels.append("long") if test_name in external_data_tests: labels.append("external_data") if test_file in docs_only_test_files: labels.append("docs_only") if test_file in serial_test_files or test_name in serial_tests: test_properties['RUN_SERIAL'] = 'TRUE' if labels: test_properties['LABELS'] = ";".join(labels) add_pytest(output, test_name, args.test_dir, dir_path, test_id, test_properties, extra_args) test_id += 1 # Process the list of cpp tests for test in cpp_tests: test_properties = {"WORKING_DIRECTORY": work_dir} # Mark tests as timed out 1 second after TEST_TIMEOUT appears in # their output (see tests/timeout_handler.py) test_properties["TIMEOUT_AFTER_MATCH"] = "\"1;TEST_TIMEOUT\"" # Use os.path.basename() to ensure we only have # the filename test_file = os.path.basename(test) dir_path = os.path.join(work_dir, "tests", test) dir_path = os.path.dirname(dir_path) add_cpp_test(output, test_file, args.test_dir, dir_path, test_properties) ================================================ FILE: tests/gnn/.gitignore ================================================ .datasets ================================================ FILE: tests/gnn/benchgnn/README.md ================================================ # benchgnn Benchmark tool for testing performance of GNN models ## Usage example ``benchgnn --dataset FakeDataset --model GAT --bs 1 100 --cpu --output outfile`` Type ``benchgnn --help`` to print detailed information about supported options. 
def run_benchmark(args, configs):
    """Benchmark every configuration and return one result row per config.

    Args:
        args (dict): parsed options. Reads ``synthetic_data``,
            ``max_num_nodes``, ``max_num_edges``, ``check_values``, ``iters``
            and the device flags ``cpu``, ``gpu`` and ``ipu``.
        configs (iterable): ``Config`` tuples
            ``(Model, ds, bs, loader, transform)``.

    Returns:
        list[dict]: per-config description (model, dataset, sizes, batch
        size, dataloader) merged with the timings returned by
        ``TrainingStepper.benchmark``.
    """
    ipu_opts = poptorch.Options()
    if args['synthetic_data']:
        ipu_opts.enableSyntheticData(True)

    results = []
    for cfg in configs:
        if cfg.transform == 'Pad':
            max_num_nodes = args['max_num_nodes']
            max_num_edges = args['max_num_edges']
            assert max_num_nodes is not None and max_num_edges is not None
            cfg.ds.transform = Pad(max_num_nodes=max_num_nodes,
                                   max_num_edges=max_num_edges)

        if cfg.loader == 'torch':
            loader = DataLoader(cfg.ds, batch_size=cfg.bs, shuffle=False)
        elif cfg.loader == 'poptorch':
            loader = IPUDataLoader(cfg.ds, batch_size=cfg.bs)
        else:
            loader = FixedSizeDataLoader(dataset=cfg.ds,
                                         num_nodes=cfg.ds[0].num_nodes,
                                         batch_size=cfg.bs)
        # Only the first batch is used, both to size the model and as the
        # benchmark input.
        d = next(iter(loader))

        params = {'loss_fn': torch.nn.MSELoss()}
        if cfg.Model.__name__ != 'GIN':
            params['out_channels'] = cfg.ds.num_classes
        if cfg.Model.__name__ == 'PNA':
            params['degree'] = PNAConv.get_degree_histogram(loader)
        if cfg.Model.__name__ == 'RGCN':
            batch = (d.edge_index, d.edge_type)
            params['in_channels'] = d.num_nodes
            params['num_relations'] = cfg.ds.num_relations
        else:
            batch = (d.x, d.edge_index)
            params['disable_dropout'] = args['check_values']
            params['in_channels'] = cfg.ds.num_features

        model = cfg.Model(**params)
        set_aggregation_dim_size(model, int(d.edge_index.max()) + 1)
        stepper = TrainingStepper(model,
                                  options=ipu_opts,
                                  enable_fp_exception=False)

        if args['check_values']:
            warn(
                'Models run without dropout layers. Turn off '
                'check-values to run the full model.', UserWarning)
            stepper.run(4, batch)

        devices = [dev for dev in ('cpu', 'gpu', 'ipu') if args[dev] is True]
        times = stepper.benchmark(args['iters'], batch, devices=devices)

        # The fallbacks are evaluated lazily: a getattr() default is computed
        # even when the attribute exists, which crashed on batches that carry
        # no node features (the RGCN input is edge_index/edge_type only).
        num_nodes = getattr(d, 'num_nodes', None)
        if num_nodes is None:
            num_nodes = d.x.size(0)
        num_edges = getattr(d, 'num_edges', None)
        if num_edges is None:
            num_edges = d.edge_index.size(1)

        result = {
            'model': cfg.Model.__name__,
            'dataset': cfg.ds.name,
            '#features': cfg.ds.num_features,
            '#classes': cfg.ds.num_classes,
            '#nodes': num_nodes,
            '#edges': num_edges,
            '#iters': args['iters'],
            'bs': cfg.bs,
            'dataloader': cfg.loader,
        }
        result.update(times)
        results.append(result)

    return results
action='store_true', default=False, help="Run on GPU") main_group.add_argument('--iters', type=int, default=200, help="Number of iterations") main_group.add_argument('--bs', nargs='+', default=[1], type=int, help="Number of graphs in batch.") main_group.add_argument('--check-values', action='store_true', default=False, help='Run checks to make sure the results are' 'correct. Models run without dropout layers.') main_group.add_argument( '--synthetic-data', action='store_true', default=False, help='Use synthetic data on IPU (no data transfers to ' 'device)') main_group.add_argument( '--loader', nargs='+', default=['torch'], help= 'Dataloader, possible values: [torch, poptorch, poptorch_fixed_size]') main_group.add_argument( '--transform', nargs='+', default=[None], help='Dataloader, possible values: [None, Pad]. Pass the required ' 'transformation parameters, for example: --max-num-nodes=30') main_group.add_argument('--fmt', type=str, default='rounded_outline', help=f'Output format, one of: {all_formats}') main_group.add_argument( '--output', type=str, default=None, help='Store JSON output file with configuration and ' 'results. You can load such file later using ' '--cfg option.') transform_group = parser.add_argument_group('Arguments for Pad transform') transform_group.add_argument( '--max-num-nodes', type=int, default=None, help='Pad transform argument. The number of nodes after padding') transform_group.add_argument( '--max-num-edges', type=int, default=None, help='Pad transform argument. 
def get_args():
    """Parse the command line and optionally merge a saved JSON config.

    Returns:
        tuple: ``(args, loaded_args)``. ``args`` is the effective option
        dict; ``loaded_args`` is the content of the file given by
        ``--print-cfg``/``--cfg`` as read from disk (empty when no file was
        given).
    """
    parser = add_main_arguments(
        argparse.ArgumentParser(
            description="Whatever comes here ...",
            add_help=True,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter))
    namespace, unknown = parser.parse_known_args()
    assert len(unknown) == 0, f'Unknown options {unknown}'
    args = vars(namespace)

    loaded_args = {}
    cfg_file = args['print_cfg'] or args['cfg']
    if cfg_file is not None:
        with open(cfg_file, "r") as infile:
            file_args = json.load(infile)
        loaded_args.update(file_args)

        # Options that describe *how* to run come from the command line;
        # options that define the test set always come from the file.
        cmdline_only = ('synthetic_data', 'check_values', 'output', 'cfg',
                        'print_cfg', 'ipu', 'cpu', 'gpu')
        for key in cmdline_only:
            file_args[key] = args[key]
        args = file_args

    checks = (
        ('dataset', all_datasets, 'Unknown dataset'),
        ('model', all_models, 'Unknown model'),
        ('loader', all_loaders, 'Unknown dataloader'),
        ('transform', all_transforms, 'Unknown transform'),
    )
    for key, allowed, message in checks:
        assert all(value in allowed for value in args[key]), message

    return args, loaded_args
class DataSets:
    """Factory for the benchmark datasets, all stored below ``root``.

    Each public method is named after the dataset it builds so callers can
    look it up by name with ``getattr``.
    """

    def __init__(self, root):
        self.root = root

    def _folder(self, name):
        # Location of one dataset inside the common root directory.
        return osp.join(self.root, name)

    def Cora(self):
        return Planetoid(self._folder('Cora'), 'Cora')

    def CiteSeer(self):
        return Planetoid(self._folder('CiteSeer'), 'CiteSeer')

    def PubMed(self):
        return Planetoid(self._folder('PubMed'), 'PubMed')

    def mutag(self):
        return Entities(self._folder('EntitiesMUTAG'), 'mutag')

    def FakeDataset(self):
        # Seed first so the generated graphs are reproducible.
        seed_everything(0)
        fake = FDS(
            num_graphs=1000,
            avg_num_nodes=16,
            avg_degree=5,
            transform=Compose([GCNNorm(), NormalizeFeatures()]),
            num_channels=64,
        )
        # The benchmark identifies datasets through a `name` attribute.
        fake.name = 'FakeDataset'
        return fake
class GCN(torch.nn.Module):
    """Two-layer GCN (``in_channels -> 32 -> out_channels``).

    ``forward`` takes ``(x, edge_index)`` and returns the log-softmax
    output; in training mode it returns ``(output, loss)`` where the loss is
    ``loss_fn`` evaluated against an all-ones target.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False):
        super().__init__()
        hidden_channels = 32
        self.conv1 = GCNConv(in_channels,
                             hidden_channels,
                             add_self_loops=False)
        self.conv2 = GCNConv(hidden_channels,
                             out_channels,
                             add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        hidden = F.relu(self.conv1(features, edge_index))
        if not self.disable_dropout:
            hidden = F.dropout(hidden, training=self.training)
        log_probs = F.log_softmax(self.conv2(hidden, edge_index), dim=1)

        if not self.training:
            return log_probs
        loss = self.loss_fn(log_probs, torch.ones_like(log_probs))
        return log_probs, loss
class PNA(torch.nn.Module):
    """Single ``PNAConv`` layer followed by optional dropout.

    ``forward`` takes ``(x, edge_index)``; in training mode it returns
    ``(output, loss)`` where the loss is ``loss_fn`` evaluated against an
    all-ones target, otherwise just the output.
    """

    def __init__(self,
                 in_channels=0,
                 out_channels=0,
                 loss_fn=None,
                 disable_dropout=False,
                 degree=None):
        super().__init__()
        self.conv = PNAConv(
            in_channels,
            out_channels,
            ['mean', 'min', 'max', 'std'],
            ['identity', 'amplification', 'attenuation'],
            deg=degree,
            add_self_loops=False)
        self.loss_fn = loss_fn
        self.disable_dropout = disable_dropout

    def forward(self, *args):
        features, edge_index = args

        out = self.conv(features, edge_index)
        if not self.disable_dropout:
            out = F.dropout(out, training=self.training)

        if not self.training:
            return out
        loss = self.loss_fn(out, torch.ones_like(out))
        return out, loss
def include_speedups_ratio(results):
    """Add ``ipu/<key>`` ratio columns to every result row, in place.

    The set of available timing columns is taken from the first row. When it
    contains ``ipu_time``, each of ``cpu_time``, ``prev_cpu``, ``prev_gpu``
    and ``prev_ipu`` that is present is divided by ``ipu_time`` and stored
    under ``'ipu/' + key``.

    Args:
        results (list[dict]): result rows; may be empty.

    Returns:
        list[dict]: the same ``results`` list (mutated).
    """
    if not results:
        # Nothing to annotate; avoids indexing into an empty list.
        return results

    keys = list(results[0].keys())
    if 'ipu_time' in keys:
        candidates = ('cpu_time', 'prev_cpu', 'prev_gpu', 'prev_ipu')
        for t in [key for key in candidates if key in keys]:
            for res in results:
                res['ipu/' + t] = res[t] / res["ipu_time"]
    return results


def print_results(results, format):  # pylint: disable=redefined-builtin
    """Print the result rows as a table, grouped by model.

    Rows are expected to be ordered by model: the model name is shown only
    on the first row of each group (it is blanked in the following rows of
    ``results``) and an empty row separates consecutive groups.

    Args:
        results (list[dict]): result rows; an empty list prints a short
            notice instead of failing.
        format (str): ``tabulate`` table format name.
    """
    if not results:
        # Happens when no requested model/dataset combination is supported.
        print('\nNo results to display.')
        return

    results = include_speedups_ratio(results)
    content = [list(results[0].keys())]
    prev_model = None
    for res in results:
        curr_model = res['model']
        if prev_model != curr_model:
            if prev_model is not None:
                content.append([])
            prev_model = curr_model
        else:
            res['model'] = ''
        row = [f'{x:.2f}' if isinstance(x, float) else x for x in res.values()]
        content.append(row)

    body = tabulate(content, headers='firstrow', tablefmt=format)
    print('\n', body, sep='')
Usage example Running single benchmark test case scenario from command line: ``python3 benchgnn_ops.py --num_sample_rounds 10 scatter --src_shape [1,12] --input_shape [1,12] --index_shape [1,12] --dim 0`` Running multiple benchmark test case scenarios from yaml configuration files from given directory: ``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_dir=example_configs`` Running multiple benchmark test case scenarios from given yaml configuration files: ``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_files=[example_configs/scatter_testcase1.yaml,example_configs/scatter_testcase2.yaml]`` Running multiple benchmark test case scenarios - combining all available options: ``python3 benchgnn_ops.py --common_config=example_configs/common.yaml --config_dir=example_configs --config_files=[example_configs/scatter_testcase1.yaml,example_configs/scatter_testcase2.yaml] scatter --src_shape [1,12] --input_shape [1,12] --index_shape [1,12] --dim 0`` Type ``python3 benchgnn_ops.py --help`` to print detailed information about supported options. ================================================ FILE: tests/gnn/benchgnn_ops/benchgnn_ops.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
def prepare_parser() -> jsonargparse.ArgumentParser:
    """Build the command-line/YAML parser of the operator benchmark.

    The parser exposes the ``BenchModelBuilder`` constructor arguments under
    ``compile_options``, the global benchmark options, and one optional
    sub-command per entry of ``bench_ops`` whose arguments mirror that
    operator's constructor.
    """
    jsonargparse.set_docstring_parse_options(attribute_docstrings=True)
    # Let jsonargparse (de)serialise shapes given as lists into torch.Size.
    jsonargparse.typing.register_type(torch.Size, torch.Size, torch.Size)

    parser = jsonargparse.ArgumentParser(prog='GNN Ops Benchmark')
    parser.add_class_arguments(BenchModelBuilder, 'compile_options')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed to use.')
    parser.add_argument(
        '--num_warmup_rounds',
        type=int,
        default=4,
        help='num_warmup_rounds: initial set of runs to discard.')
    parser.add_argument(
        '--num_sample_rounds',
        type=int,
        default=1,
        help='num_sample_rounds: the number of runs used to average the '
        'runtime.')
    parser.add_argument(
        '--calc_samples_mean',
        type=bool,
        default=True,
        help='calculate mean over collected `num_sample_rounds` measurements.')
    parser.add_argument(
        '--clock',
        type=int,
        default=None,
        help='manually override clock value (Mhz) read by gcipuinfo.')
    parser.add_argument(
        '--common_config',
        type=jsonargparse.typing.Path_fr,
        default=None,
        help='yaml file containing configuration options shared between all '
        'benchmark testcases.')
    parser.add_argument(
        '--config_dir',
        type=jsonargparse.typing.Path_drw,
        default=None,
        # A space was missing between the two concatenated literals.
        help='directory with a set of yaml benchmark test case scenario '
        'files.')
    parser.add_argument(
        '--config_files',
        type=List[jsonargparse.typing.Path_fr],
        default=None,
        help='set of yaml file paths containing benchmark test case scenarios.')

    subcommands = parser.add_subcommands(required=False, dest='operation')
    for command, op_type in bench_ops.items():
        subparser = jsonargparse.ArgumentParser()
        subparser.add_class_arguments(op_type)
        subcommands.add_subcommand(command, subparser)

    return parser
def run_benchmark(testcases: List[Tuple[str, jsonargparse.namespace.Namespace]]
                  ) -> pd.DataFrame:
    """Run every test case and concatenate the measurements.

    Args:
        testcases: ``(config_source, testcase_config)`` pairs, forwarded
            positionally to ``collect_measurements``.

    Returns:
        One data frame holding the rows of all test cases (re-indexed), or
        an empty data frame when ``testcases`` is empty.
    """
    if not testcases:
        # pd.concat() raises on an empty list.
        return pd.DataFrame()

    data_frames = []
    bar = tqdm(total=len(testcases),
               desc="Benchmarking progress",
               unit="testcase",
               position=3)
    try:
        for testcase_config in testcases:
            data_frames.append(collect_measurements(*testcase_config))
            bar.update()
        bar.refresh()
    finally:
        # Always release the progress bar, even when a test case fails.
        bar.clear()
        bar.close()

    return pd.concat(data_frames, ignore_index=True)
def parse_test_case_config_files(
        test_case_config_paths: List[str],
        parser: "jsonargparse.ArgumentParser | None" = None
) -> "List[Tuple[str, jsonargparse.namespace.Namespace]]":
    """Parse each YAML test-case file into a ``(file name, config)`` pair.

    Args:
        test_case_config_paths: paths of the test-case files to parse.
        parser: parser used to read the files. When omitted, the
            module-level ``parser`` created by the script entry point is
            used, which keeps the previous call style working.

    Returns:
        One ``(basename, parsed namespace)`` tuple per input path, in input
        order.

    Raises:
        Exception: whatever ``parser.parse_path`` raises; the offending
            path is printed before the exception is re-raised.
    """
    if test_case_config_paths and parser is None:
        # Legacy behaviour: rely on the global set up under `__main__`.
        parser = globals()['parser']

    test_case_configs = []
    for file_path in test_case_config_paths:
        try:
            test_case_configs.append((
                os.path.basename(file_path),
                parser.parse_path(file_path),
            ))
        except Exception:
            print(f'Parsing {file_path} failed.')
            raise
    return test_case_configs


def get_test_case_configs(
        parser: "jsonargparse.ArgumentParser",
        user_params: "jsonargparse.namespace.Namespace"
) -> "List[Tuple[str, jsonargparse.namespace.Namespace]]":
    """Collect all test cases requested on the command line and in files.

    A test case given as a sub-command is labelled ``'cmd'``; file-based
    test cases are labelled with their file name.
    """
    test_case_configs = []
    if 'operation' in user_params:
        test_case_configs.append((
            'cmd',
            parser.parse_args(defaults=True),
        ))

    config_paths = get_test_case_config_paths(user_params)
    # Pass the parser explicitly instead of depending on a global.
    test_case_configs.extend(
        parse_test_case_config_files(config_paths, parser=parser))
    return test_case_configs
def _create_poptorch_options(
        synthetic_data: bool = False,
        available_memory_proportion: Optional[float] = None,
        profile_dir: Optional[str] = None,
        cache_dir: str = 'benchgnn_model_cache') -> poptorch.Options:
    """Build the ``poptorch.Options`` used to compile benchmark models.

    Cycle-count logging, executable caching (in ``cache_dir``) and the
    on-demand connection type are always enabled.

    Args:
        synthetic_data: value passed to ``enableSyntheticData``.
        available_memory_proportion: when not ``None``, applied to ``IPU0``
            through ``setAvailableMemoryProportion``.
        profile_dir: when non-empty, profiling is enabled with this
            directory.
        cache_dir: directory of the executable cache.

    Returns:
        The configured options object.
    """
    options = poptorch.Options()
    options.enableSyntheticData(synthetic_data)
    # Needed so the compiled model can report cycleCount() afterwards.
    options.logCycleCount(True)
    options.enableExecutableCaching(cache_dir)
    options.connectionType(poptorch.ConnectionType.OnDemand)

    if available_memory_proportion is not None:
        amp_dict = {"IPU0": available_memory_proportion}
        options.setAvailableMemoryProportion(amp_dict)

    if profile_dir:
        options.enableProfiling(profile_dir)

    return options
(default: :obj:`None`) cache_dir (str, optional): saves the executable cache to the provided location. (default: :obj:`benchgnn_model_cache`) num_repeats (int, optional): the number of times to invoke the operator on device. (default: :obj:`128`) """ self.num_repeats = num_repeats self.options = _create_poptorch_options(synthetic_data, available_memory_proportion, profile_dir, cache_dir) def create_model(self, operator: torch.nn.Module): model = BenchModel(num_repeats=self.num_repeats, operator=operator) pop_model = poptorch.inferenceModel(model, options=self.options) pop_model.compile() return pop_model ================================================ FILE: tests/gnn/benchgnn_ops/example_configs/common.yaml ================================================ num_sample_rounds: 25 compile_options: num_repeats: 100 ================================================ FILE: tests/gnn/benchgnn_ops/example_configs/scatter_testcase1.yaml ================================================ scatter: src_shape: [1,120] input_shape: [1,120] index_shape: [1,120] dim: 0 ================================================ FILE: tests/gnn/benchgnn_ops/example_configs/scatter_testcase2.yaml ================================================ scatter: src_shape: [1,12] input_shape: [1,12] index_shape: [1,12] dim: 0 ================================================ FILE: tests/gnn/benchgnn_ops/metrics.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import warnings from typing import Any, Dict, List, Optional try: import gcipuinfo # type: ignore except ImportError: gcipuinfo = None import pandas as pd import torch def _get_clock_value() -> int: if gcipuinfo is None: default_clock_value = 1850 warnings.warn('Unable to import gcipuinfo. 
class PerfMetrics:
    r"""Track performance metrics from:
        * recorded number of cycles
        * sizes of input / output

    Defines an effective bandwidth from the size of the output result.
    """

    def __init__(self,
                 config_src: str,
                 operator: torch.nn.Module,
                 num_repeats: int,
                 op_name: str,
                 op_params: str,
                 clock: Optional[int] = None) -> None:
        """Capture the static description of one benchmarked operator.

        Args:
            config_src: label of the configuration source, reported as-is.
            operator: benchmarked module; its ``output`` tensor determines
                the number of bytes used for the bandwidth figure.
            num_repeats: number of operator invocations per measured run.
            op_name: operator name, reported as-is.
            op_params: printable operator parameters, reported as-is.
            clock: clock in MHz; read through ``_get_clock_value`` when
                ``None``.
        """
        output = operator.output
        # element_size() is defined for every dtype, whereas torch.finfo()
        # raises for integer and boolean outputs.
        numbytes = output.element_size()
        self.out_gib = output.numel() * numbytes / 1024**3
        self.num_repeats = num_repeats
        self.clock = _get_clock_value() if clock is None else clock
        self.op_name = op_name
        self.op_params = op_params
        self.config_src = config_src

    def get_measurement(self, cycles: int) -> Dict[str, Any]:
        """Convert a raw cycle count into one result row.

        Args:
            cycles: cycles recorded for ``num_repeats`` invocations.

        Returns:
            Row with the per-invocation cycles, the clock, the time in
            microseconds and the effective bandwidth in GiB/s.
        """
        avg_cycles = cycles / self.num_repeats
        # The clock is in MHz, i.e. cycles per microsecond.
        time_us = avg_cycles / self.clock
        time_s = time_us * 10**-6
        effective_bandwidth = self.out_gib / time_s

        return {
            'operator': self.op_name,
            'cycles': avg_cycles,
            'clock (MHz)': self.clock,
            'time (us)': time_us,
            'effective bandwidth (GiB/s)': effective_bandwidth,
            'parameters': self.op_params,
            'config source': self.config_src
        }
class ScatterOp(torch.nn.Module):
    def __init__(self, dim: int, input_shape: torch.Size,
                 index_shape: torch.Size, src_shape: torch.Size) -> None:
        """Scatter Op.

        Random operands are generated once and stored as buffers together
        with the reference result of scattering them.

        Args:
            dim (int): the axis along which to index.
            input_shape (torch.Size): the scatter input tensor shape.
            index_shape (torch.Size): the indices shape of elements to
                scatter.
            src_shape (torch.Size): the source element(s) shape to scatter.
        """
        super().__init__()
        self.dim = dim

        # Keep this creation order: it fixes the order of the RNG draws.
        operands = {
            'input': torch.randn(*input_shape),
            'index': torch.randint(input_shape[dim], index_shape),
            'src': torch.randn(*src_shape),
        }
        for buffer_name, tensor in operands.items():
            self.register_buffer(buffer_name, tensor)

        reference = self(operands['input'], operands['index'],
                         operands['src'], None)[-1]
        self.register_buffer('output', reference)

    def loop_inputs(self) -> List[torch.Tensor]:
        """Return the tensors carried through the benchmark loop."""
        return [self.input, self.index, self.src, self.output]

    def forward(
            self,
            input: torch.Tensor,
            index: torch.Tensor,
            src: torch.Tensor,
            output: torch.Tensor  # pylint: disable=unused-argument
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Pass the operands through and append the scatter result."""
        scattered = torch.scatter(input, self.dim, index, src)
        return input, index, src, scattered
class IndexReduceOp(torch.nn.Module):
    def __init__(self,
                 dim: int,
                 input_shape: torch.Size,
                 index_shape: torch.Size,
                 src_shape: torch.Size,
                 reduce: str,
                 include_self: bool = True) -> None:
        """IndexReduce Op.

        Random operands are generated once and stored as buffers together
        with the reference result of reducing them.

        Args:
            dim: the axis along which to index.
            input_shape: the index reduce input tensor shape.
            index_shape: the indices shape of elements to select from.
            src_shape: the source element(s) shape.
            reduce: the reduction operation to apply
                ("prod", "mean", "amax", "amin")
            include_self: whether elements from the self tensor are
                included in the reduction
        """
        super().__init__()
        self.dim = dim
        self.reduce = reduce
        self.include_self = include_self

        input = torch.randn(*input_shape)
        index = torch.randint(input_shape[dim], index_shape)
        src = torch.randn(*src_shape)

        self.register_buffer('input', input)
        self.register_buffer('index', index)
        self.register_buffer('src', src)
        self.register_buffer('output', self(input, index, src, None)[-1])

    def loop_inputs(self) -> List[torch.Tensor]:
        """Return the tensors carried through the benchmark loop."""
        return [self.input, self.index, self.src, self.output]

    def forward(
            self,
            input: torch.Tensor,
            index: torch.Tensor,
            src: torch.Tensor,
            output: torch.Tensor  # pylint: disable=unused-argument
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Pass the operands through and append the index_reduce result."""
        # Out-of-place on purpose: the in-place variant overwrote the
        # `input` buffer and made `input` and `output` the same tensor.
        reduced = input.index_reduce(self.dim,
                                     index,
                                     src,
                                     self.reduce,
                                     include_self=self.include_self)
        return input, index, src, reduced
index_shape: the indices shape of elements to gather. """ super().__init__() self.dim = dim input = torch.randn(*input_shape) index = torch.randint(input_shape[dim], index_shape) self.register_buffer('input', input) self.register_buffer('index', index) self.register_buffer('output', self(input, index, None)[-1]) def loop_inputs(self) -> List[torch.Tensor]: return [self.input, self.index, self.output] def forward(self: torch.tensor, input: torch.tensor, index: torch.tensor, _: torch.tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: return input, index, torch.gather(input, self.dim, index) bench_ops = { 'gather': GatherOp, 'index_reduce': IndexReduceOp, 'index_select': IndexSelectOp, 'scatter': ScatterOp, 'scatter_reduce': ScatterReduceOp } ================================================ FILE: tests/gnn/benchgnn_ops/requirements.txt ================================================ jsonargparse==4.19.0 docstring-parser==0.15 tqdm==4.64.1 ================================================ FILE: tests/gnn/conftest.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
@pytest.fixture(scope="module")
def pyg_qm9(pytestconfig):
    """Load the QM9 dataset from the external datasets directory.

    The dataset root is `<external_datasets_dir>/qm9`, where
    `external_datasets_dir` is read from the pytest configuration options.

    Raises:
        RuntimeError: if the dataset root does not exist.
    """
    datasets_dir = pytestconfig.getoption("external_datasets_dir")
    qm9root = osp.join(datasets_dir, "qm9")
    if osp.exists(qm9root):
        return pyg.datasets.QM9(root=qm9root)
    raise RuntimeError(f'Path {qm9root} not exists.')
@pytest.fixture(scope="module")
def fake_hetero_data() -> pyg.data.HeteroData:
    """Return the single graph of a one-element fake heterogeneous dataset.

    The global seed is fixed first so the generated graph is reproducible.
    Note that the fixture yields the first element of the dataset, not the
    dataset itself.
    """
    pyg.seed_everything(1410)
    dataset = pyg.datasets.FakeHeteroDataset(num_graphs=1,
                                             num_node_types=2,
                                             num_edge_types=5,
                                             avg_num_nodes=50)
    return dataset[0]
torch_geometric.nn.aggr.SetTransformerAggregation, torch_geometric.nn.aggr.LSTMAggregation)): kwargs["max_num_elements"] = size result = self.aggr(x_broadcasted, aggr_index, dim_size=size, **kwargs) if self.post_proc is not None: if isinstance(result, List): nodes_mask = nodes_mask.repeat(len(result)) result = torch.cat(result) result = self.post_proc(result) # Apply nodes mask, so that the loss may be computed properly result[~nodes_mask] = 0 if self.training: if isinstance(result, List): result = torch.cat(result) target = torch.ones_like(result) target[~nodes_mask] = 0 loss = self.loss_fn(result, target) # In case, the loss function applies mean reduction, the result # has to be rescaled by the effective size of the batch # (excluding padding). if self.mean_reduction_in_loss: real_size = torch.count_nonzero(nodes_mask) loss = loss * size / real_size return result, loss return result model = AggrWrapper(aggr, loss_fn=loss_fn, post_proc=post_proc) stepper = TrainingStepper(model, atol=atol, rtol=rtol, equal_nan=equal_nan, enable_fp_exception=enable_fp_exception) if dataloader is not None: for step, batch in enumerate(dataloader): if step == num_steps: break stepper.run( 1, (batch.x, batch.edge_index, batch.nodes_mask, dim_size)) ================================================ FILE: tests/gnn/nn/aggr/conftest.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
@pytest.fixture
def dataloader():
    """Provide a fixed-size loader over a small, seeded fake dataset.

    Batches are packed with the `StreamPack` strategy to 12 nodes and
    32 edges, and padding masks are attached to every batch.
    """
    seed_everything(42)
    fake_graphs = FakeDataset(num_graphs=4,
                              avg_num_nodes=8,
                              avg_degree=3,
                              transform=NormalizeFeatures(),
                              num_channels=8)
    padded_sizes = FixedSizeOptions(num_nodes=12, num_edges=32)
    return FixedSizeDataLoader(
        fake_graphs,
        fixed_size_options=padded_sizes,
        fixed_size_strategy=FixedSizeStrategy.StreamPack,
        add_pad_masks=True)
@pytest.mark.parametrize('Aggregation', [
    SoftmaxAggregation,
    PowerMeanAggregation,
])
@pytest.mark.parametrize('learn', [True, False])
def test_gen_aggregation(dataloader, Aggregation, learn):
    """Check the generalised aggregations with and without learnt params."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = Aggregation(learn=learn)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    # PowerMeanAggregation is run with floating point exceptions disabled
    # and with NaNs compared as equal; every other aggregation uses the
    # strict settings.
    is_power_mean = isinstance(aggr, PowerMeanAggregation)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 equal_nan=is_power_mean,
                 enable_fp_exception=not is_power_mean)
def test_deep_sets_aggregation(dataloader):
    """Check DeepSetsAggregation with linear local and global networks."""
    sample = next(iter(dataloader))
    width = sample.num_node_features

    # The local network doubles the feature width and the global network
    # doubles it again.
    local_nn = Linear(width, width * 2)
    global_nn = Linear(width * 2, width * 4)

    aggr = DeepSetsAggregation(local_nn=local_nn, global_nn=global_nn)
    aggr.reset_parameters()

    aggr_harness(aggr, sample.num_nodes, dataloader)
@pytest.mark.skip(reason="TODO(AFS-351)")
def test_graph_multiset_transformer(dataloader):
    """Check GraphMultisetTransformer through the aggregation harness.

    The harness is run on sorted indices, with floating point exceptions
    disabled and NaNs compared as equal.
    """
    first_sample = next(iter(dataloader))
    channels = first_sample.num_node_features

    aggr = GraphMultisetTransformer(channels, k=2, heads=2)
    aggr.reset_parameters()

    aggr_harness(aggr,
                 first_sample.num_nodes,
                 dataloader,
                 sorted_index=True,
                 enable_fp_exception=False,
                 equal_nan=True)
def test_mlp_aggregation(dataloader):
    """Check a single-layer MLPAggregation on sorted indices."""
    sample = next(iter(dataloader))
    num_nodes = sample.num_nodes
    width = sample.num_node_features

    # The number of nodes in the first batch is used as the maximum number
    # of elements per aggregated set.
    aggr = MLPAggregation(in_channels=width,
                          out_channels=width * 2,
                          max_num_elements=num_nodes,
                          num_layers=1)

    aggr_harness(aggr, num_nodes, dataloader, sorted_index=True)
@pytest.mark.parametrize('mode', [
    'cat', 'proj', 'attn', 'sum', 'mean', 'max', 'min', 'logsumexp', 'std',
    'var'
])
def test_multi_aggr(dataloader, mode):
    """Check MultiAggregation for every supported combine mode."""
    sample = next(iter(dataloader))
    in_channels = sample.num_node_features
    out_channels = in_channels * 2

    # Only the 'proj' and 'attn' combine modes take extra arguments.
    if mode == 'proj':
        mode_kwargs = {
            'in_channels': in_channels,
            'out_channels': in_channels,
        }
    elif mode == 'attn':
        mode_kwargs = {
            'in_channels': in_channels,
            'out_channels': in_channels,
            'num_heads': in_channels // 4,
        }
    else:
        mode_kwargs = None

    aggrs = ['mean', 'sum', 'max']
    aggr = MultiAggregation(aggrs, mode=mode, mode_kwargs=mode_kwargs)
    aggr.reset_parameters()

    if mode == 'cat':
        # The 'cat' combine mode expands the output dimensions by the
        # number of aggregators.
        num_aggrs = len(aggrs)
        in_channels *= num_aggrs
        out_channels *= num_aggrs

    post_proc = torch.nn.Linear(in_channels, out_channels)

    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc, atol=1e-3)
@pytest.mark.parametrize('q', [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.])
@pytest.mark.parametrize('interpolation', QuantileAggregation.interpolations)
def test_quantile_aggregation(dataloader, q, interpolation):
    """Check QuantileAggregation for each quantile and interpolation."""
    torch.manual_seed(42)

    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    aggr = QuantileAggregation(q=q, interpolation=interpolation)
    post_proc = torch.nn.Linear(num_features, num_features * 2)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 sorted_index=True)
@pytest.mark.parametrize('scaler',
                         [['identity'], ['amplification'], ['attenuation'],
                          ['linear'], ['inverse_linear']])
@pytest.mark.parametrize('train_norm', [True, False])
def test_degree_scaler_aggregation(dataloader, scaler, train_norm):
    """Check DegreeScalerAggregation for every scaler, with/without norm."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features

    basic_aggrs = ['mean', 'sum', 'max']
    num_aggrs = len(basic_aggrs)

    # Fixed degree tensor handed to the aggregation as its `deg` argument.
    deg = torch.tensor([2, 5, 3, 1, 2, 3, 4, 1, 2, 0])
    aggr = DegreeScalerAggregation(basic_aggrs,
                                   scaler,
                                   deg,
                                   train_norm=train_norm)

    # The post-processing layer is sized for one feature block per
    # basic aggregator.
    post_proc = torch.nn.Linear(num_features * num_aggrs,
                                num_features * 2 * num_aggrs)

    aggr_harness(aggr, sample.num_nodes, dataloader, post_proc)
def test_sort_aggregation(dataloader):
    """Check SortAggregation with k=5 on sorted indices."""
    sample = next(iter(dataloader))
    num_features = sample.num_node_features
    top_k = 5

    aggr = SortAggregation(k=top_k)
    # The post-processing layer is sized for `top_k` feature blocks.
    post_proc = torch.nn.Linear(top_k * num_features,
                                top_k * num_features * 2)

    aggr_harness(aggr,
                 sample.num_nodes,
                 dataloader,
                 post_proc,
                 sorted_index=True)
def get_dataset(num_channels=16):
    """Return the first graph of a seeded FakeDataset.

    The graph is transformed with `GCNNorm` followed by `NormalizeFeatures`
    and is annotated with the dataset's `num_classes`.

    Args:
        num_channels: the `num_channels` value passed to `FakeDataset`.
    """
    seed_everything(0)

    fake_graphs = FakeDataset(avg_num_nodes=32,
                              avg_degree=5,
                              transform=Compose(
                                  [GCNNorm(), NormalizeFeatures()]),
                              num_channels=num_channels)

    graph = fake_graphs[0]
    graph.num_classes = fake_graphs.num_classes
    return graph
def generate_edge_index(num_src_nodes, num_dst_nodes, num_edges):
    """Build a random `[2, num_edges]` edge index tensor.

    Row 0 holds source ids drawn from `[0, num_src_nodes)` and row 1 holds
    destination ids drawn from `[0, num_dst_nodes)`; both are `torch.long`.
    """
    size = (num_edges, )
    # Sources are drawn before destinations, which fixes the order in which
    # the random number generator is consumed.
    sources = torch.randint(num_src_nodes, size, dtype=torch.long)
    destinations = torch.randint(num_dst_nodes, size, dtype=torch.long)
    return torch.stack((sources, destinations), dim=0)
def test_antisymmetric_conv(dataset):
    """Check AntiSymmetricConv using a bias-free GCNConv as `phi`."""
    channels = dataset.num_node_features

    # `phi` keeps the feature width unchanged and adds no self loops.
    phi = GCNConv(channels, channels, bias=False, add_self_loops=False)

    conv_harness(AntiSymmetricConv(channels, phi=phi), dataset)
def test_appnp(dataset):
    """Check APPNP followed by a linear post-processing layer."""
    # The linear layer is created before the convolution, so parameter
    # initialisation consumes the random number generator in this order.
    post_proc = torch.nn.Linear(dataset.num_node_features, out_channels)
    conv = APPNP(K=10, alpha=0.1, dropout=0.0, **conv_kwargs)

    conv_harness(conv, dataset, post_proc=post_proc)
@pytest.mark.skip(
    reason="ChebConv won't work, because algorithm requires removing "
    "self loops and we are adding self loops to ensure that "
    "tensors have fixed size.")
def test_cheb_conv(dataset):
    """Check ChebConv with progressively richer forward arguments."""
    conv = ChebConv(dataset.num_node_features,
                    32,
                    K=3,
                    add_self_loops=False)

    # Default inputs taken from the dataset.
    conv_harness(conv, dataset)

    x = dataset.x
    edge_index = dataset.edge_index
    edge_weight = dataset.edge_weight

    # With explicit edge weights.
    conv_harness(conv, batch=(x, edge_index, edge_weight))

    # With a scalar lambda_max and no batch vector.
    conv_harness(conv, batch=(x, edge_index, edge_weight, None, 3.0))

    # With a batch vector assigning the nodes past the midpoint to a second
    # graph, and one lambda_max per graph.
    num_nodes = dataset.num_nodes
    midpoint = num_nodes // 2
    batch_mask = torch.tensor(
        [int(node > midpoint) for node in range(num_nodes)])
    lambda_max = torch.tensor([2.0, 3.0])
    conv_harness(conv,
                 batch=(x, edge_index, edge_weight, batch_mask, lambda_max))
@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_dna_conv(conv_kwargs):
    """Check DNAConv on a small star graph with layer-stacked features."""
    channels, num_layers = 32, 3

    # Node 0 is connected to nodes 1-3 in both directions.
    edge_index = torch.tensor([[0, 0, 0, 1, 2, 3], [1, 2, 3, 0, 0, 0]])
    num_nodes = int(edge_index.max()) + 1

    # Features are created before the convolution, so the random number
    # generator is consumed in this order.
    x = torch.randn((num_nodes, num_layers, channels))
    conv = DNAConv(channels,
                   dropout=0.0,
                   add_self_loops=False,
                   **conv_kwargs)

    conv_harness(conv, batch=(x, edge_index))
@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_eg_conv(dataset, conv_kwargs):
    """Check EGConv with 32 output channels for each keyword set."""
    conv = EGConv(dataset.num_node_features, 32, **conv_kwargs)
    conv_harness(conv, dataset)
@pytest.mark.parametrize('conv_kwargs', conv_kwargs_list)
def test_gat_conv(dataset, conv_kwargs):
    """Check GATConv with two heads for each edge-feature configuration."""
    in_channels = dataset.num_node_features

    # Build a fresh keyword dict instead of writing into `conv_kwargs`: the
    # parametrized dicts are module-level objects shared between test runs
    # and must not be mutated by a test.
    kwargs = {**conv_kwargs, "add_self_loops": False}

    conv = GATConv(in_channels, out_channels, heads=2, **kwargs)
    conv_harness(conv, dataset)
import pytest from torch_geometric.nn import GATv2Conv from conv_utils import conv_harness out_channels = 32 conv_kwargs_list = [ { 'edge_dim': None }, { 'edge_dim': 1, 'fill_value': 0.5 }, { 'edge_dim': 1, 'fill_value': 'mean' }, { 'edge_dim': 4, 'fill_value': 0.5 }, { 'edge_dim': 4, 'fill_value': 'mean' }, ] @pytest.mark.parametrize('conv_kwargs', conv_kwargs_list) def test_gatv2_conv(dataset, conv_kwargs): in_channels = dataset.num_node_features conv_kwargs["add_self_loops"] = False conv = GATv2Conv(in_channels, out_channels, heads=2, **conv_kwargs) conv_harness(conv, dataset, atol=1e-4, rtol=1e-3) ================================================ FILE: tests/gnn/nn/conv/test_gcn2_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import GCN2Conv from conv_utils import conv_harness out_channels = 16 def test_gcn2_conv(dataset): print(dataset) in_channels = dataset.num_node_features conv = GCN2Conv(in_channels, alpha=float(0.2), add_self_loops=False) x2 = torch.randn_like(dataset.x) batch = (dataset.x, x2, dataset.edge_index) conv_harness(conv, dataset, batch=batch, num_steps=1) ================================================ FILE: tests/gnn/nn/conv/test_gcn_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import pytest from torch_geometric.nn import GCNConv from conv_utils import conv_harness out_channels = 32 conv_kwargs = {'add_self_loops': False} @pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source']) def test_gcn_conv(dataset, flow): in_channels = dataset.num_node_features conv = GCNConv(in_channels, out_channels, flow, **conv_kwargs) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_gen_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import GENConv from conv_utils import conv_harness @pytest.mark.parametrize('aggr', ['softmax', 'powermean']) def test_gen_conv(aggr, dataset): in_channels = dataset.num_node_features conv = GENConv(in_channels, 32, aggr, edge_dim=16, add_self_loops=False, norm='layer') conv_harness(conv, dataset) x2 = torch.randn(dataset.x.shape) batch = ((dataset.x, x2), dataset.edge_index) conv_harness(conv, dataset, batch=batch) conv = GENConv((in_channels, in_channels), 32, aggr, add_self_loops=False, norm='layer') conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_general_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import GeneralConv from conv_utils import conv_harness out_channels = 32 num_edge_attr = 16 conv_kwargs_list = [{ 'skip_linear': True }, { 'directed_msg': False }, { 'heads': 3 }, { 'attention': True }, { 'heads': 3, 'attention': True }, { 'heads': 3, 'attention': True, 'attention_type': 'dot_product' }, { 'l2_normalize': True }] @pytest.mark.parametrize('conv_kwargs', conv_kwargs_list) def test_general_conv(dataset, conv_kwargs): in_channels = dataset.num_node_features conv = GeneralConv(in_channels, out_channels, num_edge_attr, **conv_kwargs) e1 = torch.randn(dataset.num_edges, num_edge_attr) batch = (dataset.x, dataset.edge_index, e1) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_gin_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import torch from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from torch_geometric.nn import GINConv, GINEConv from conv_utils import conv_harness out_channels = 32 def test_gin_conv(dataset): in_channels = dataset.num_node_features nn = Seq(Lin(in_channels, 32), ReLU(), Lin(32, 32)) conv = GINConv(nn, train_eps=True) conv_harness(conv, dataset) def test_gine_conv(dataset): in_channels = dataset.num_node_features nn = Seq(Lin(in_channels, 32), ReLU(), Lin(32, 32)) conv = GINEConv(nn, train_eps=True) value = torch.randn(dataset.num_edges, 16) batch = (dataset.x, dataset.edge_index, value) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_gmm_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import GMMConv from conv_utils import conv_harness @pytest.mark.parametrize('separate_gaussians', [True, False]) def test_gmm_conv(separate_gaussians, dataset): in_channels = dataset.num_node_features conv = GMMConv(in_channels, 32, dim=3, kernel_size=25, separate_gaussians=separate_gaussians, add_self_loops=False) value = torch.rand(dataset.num_edges, 3) batch = (dataset.x, dataset.edge_index, value) conv_harness(conv, batch=batch) @pytest.mark.parametrize('separate_gaussians', [True, False]) def test_gmm_conv_bipartite(separate_gaussians, dataset): in_channels = dataset.num_node_features conv = GMMConv((in_channels, in_channels), 32, dim=3, kernel_size=5, separate_gaussians=separate_gaussians, add_self_loops=False) value = torch.rand(dataset.num_edges, 3) x2 = torch.randn(dataset.x.shape) batch = ((dataset.x, x2), dataset.edge_index, value) conv_harness(conv, batch=batch) batch = ((dataset.x, None), dataset.edge_index, value) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_gps_conv.py 
================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import GPSConv, SAGEConv from conv_utils import conv_harness @pytest.mark.skip(reason="TODO(AFS-279, AFS-162)") @pytest.mark.parametrize('norm', [None, 'batch_norm', 'layer_norm']) def test_gps_conv(norm, dataset): in_channels = dataset.num_node_features conv = GPSConv(in_channels, conv=SAGEConv(16, 16, add_self_loops=False), heads=4, norm=norm) conv.reset_parameters() conv_harness(conv, dataset) @pytest.mark.skip(reason="TODO(AFS-279, AFS-162)") @pytest.mark.parametrize('norm', [None, 'batch_norm', 'layer_norm']) def test_gps_conv_with_batch_index_tensor(norm, dataset): in_channels = dataset.num_node_features conv = GPSConv(in_channels, conv=SAGEConv(16, 16, add_self_loops=False), heads=4, norm=norm) conv.reset_parameters() batch_index = [ i > dataset.num_nodes // 2 for i in range(dataset.num_nodes) ] batch_index = torch.tensor(batch_index, dtype=torch.int64) batch = (dataset.x, dataset.edge_index, batch_index) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_graph_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. from torch_geometric.nn import GraphConv from conv_utils import conv_harness out_channels = 16 def test_graph_conv(dataset): in_channels = dataset.num_node_features conv = GraphConv(in_channels, out_channels) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_gravnet_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import GravNetConv from torch_geometric.testing import withPackage from conv_utils import conv_harness @withPackage('torch_cluster') def test_gravnet_conv(dataset): in_channels = dataset.num_node_features out_channels = 32 conv = GravNetConv(in_channels, out_channels, space_dimensions=4, propagate_dimensions=8, k=2, add_self_loops=False) conv_harness(conv, batch=(dataset.x, )) num_nodes = dataset.num_nodes batch_index = [1 if i > num_nodes // 2 else 0 for i in range(num_nodes)] conv_harness(conv, batch=(dataset.x, batch_index)) x2 = torch.randn_like(dataset.x) conv_harness(conv, batch=((dataset.x, x2), ), atol=5e-05, rtol=0.001) conv_harness(conv, batch=((dataset.x, x2), (torch.Tensor(batch_index), torch.Tensor(batch_index))), atol=5e-03, rtol=0.1) ================================================ FILE: tests/gnn/nn/conv/test_han_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. from torch_geometric.nn import HANConv from conv_utils import hetero_conv_harness, random_heterodata def test_han_conv(): data, in_channels = random_heterodata() metadata = data.metadata() conv = HANConv(in_channels, 16, metadata, heads=2, add_self_loops=False) hetero_conv_harness(conv, data, 'author') def test_han_conv_lazy(): data, _ = random_heterodata() metadata = data.metadata() conv = HANConv(-1, 16, metadata, heads=2, add_self_loops=False) _ = conv(data.x_dict, data.edge_index_dict) hetero_conv_harness(conv, data, 'author') ================================================ FILE: tests/gnn/nn/conv/test_heat_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest
import torch
from torch_geometric.nn import HEATConv

from conv_utils import conv_harness


@pytest.mark.parametrize('concat', [True, False])
def test_heat_conv(concat):
    """Build a HEATConv on a hand-written 4-node graph with node types, edge
    types and 2-wide edge attributes, and hand it to ``conv_harness``."""
    x = torch.randn(4, 8)
    edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
    edge_attr = torch.randn((4, 2))
    node_type = torch.tensor([0, 0, 1, 2])
    edge_type = torch.tensor([0, 2, 1, 2])

    conv = HEATConv(in_channels=8,
                    out_channels=16,
                    num_node_types=3,
                    num_edge_types=3,
                    edge_type_emb_dim=5,
                    edge_dim=2,
                    edge_attr_emb_dim=6,
                    heads=2,
                    concat=concat,
                    add_self_loops=False)

    conv_harness(conv,
                 batch=(x, edge_index, node_type, edge_type, edge_attr),
                 atol=5e-4,
                 rtol=0.3)


================================================
FILE: tests/gnn/nn/conv/test_hetero_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest
import torch
from torch_geometric.data import HeteroData
from torch_geometric.nn import (GATConv, GCNConv, HeteroConv, Linear,
                                MessagePassing, SAGEConv, to_hetero)
import torch_geometric.transforms as T

from conv_utils import hetero_conv_harness


def get_edge_index(num_src_nodes, num_dst_nodes, num_edges):
    """Return a ``(2, num_edges)`` long tensor of random edges.

    Row 0 is drawn from ``[0, num_src_nodes)`` and row 1 from
    ``[0, num_dst_nodes)``.
    """
    row = torch.randint(num_src_nodes, (num_edges, ), dtype=torch.long)
    col = torch.randint(num_dst_nodes, (num_edges, ), dtype=torch.long)
    return torch.stack([row, col], dim=0)


def get_dummy_data():
    """Return a random ``HeteroData`` with 50 'paper' nodes (32 features)
    and 30 'author' nodes (64 features), edges between every pair of types,
    3-wide edge attributes on paper->author and edge weights on
    paper->paper."""
    data = HeteroData()
    data['paper'].x = torch.randn(50, 32)
    data['author'].x = torch.randn(30, 64)
    data['paper', 'paper'].edge_index = get_edge_index(50, 50, 200)
    data['paper', 'author'].edge_index = get_edge_index(50, 30, 100)
    data['paper', 'author'].edge_attr = torch.randn(100, 3)
    data['author', 'paper'].edge_index = get_edge_index(30, 50, 100)
    data['paper', 'paper'].edge_weight = torch.rand(200)
    data['author', 'author'].edge_index = get_edge_index(30, 30, 100)
    return data


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat', None])
def test_hetero_conv(aggr):
    """Hand a single HeteroConv (GCN / SAGE / GAT per edge type) to
    ``hetero_conv_harness`` for each aggregation mode."""
    data = get_dummy_data()

    conv = HeteroConv(
        {
            ('paper', 'to', 'paper'):
            GCNConv(-1, 64, add_self_loops=False),
            ('author', 'to', 'paper'):
            SAGEConv((-1, -1), 64, add_self_loops=False),
            ('paper', 'to', 'author'):
            GATConv((-1, -1), 64, edge_dim=3, add_self_loops=False),
        },
        aggr=aggr)

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = conv(data.x_dict,
             data.edge_index_dict,
             data.edge_attr_dict,
             edge_weight_dict=data.edge_weight_dict)

    forward_args = ('x_dict', 'edge_index_dict', 'edge_attr_dict',
                    'edge_weight_dict')
    hetero_conv_harness(conv, data, 'author', forward_args=forward_args)


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat'])
@pytest.mark.parametrize('num_layers', [2, 5])
def test_hetero_conv_multiple_layers(aggr, num_layers):
    """Hand a stack of ``num_layers`` HeteroConv layers, each followed by
    ReLU, to ``hetero_conv_harness`` with ``enable_fp_exception=False``."""
    data = get_dummy_data()

    class MultiLayerHeteroConv(torch.nn.Module):
        """Stack of identical HeteroConv layers with a ReLU after each."""

        def __init__(self, num_layers):
            super().__init__()
            self.convs = torch.nn.ModuleList()
            for _ in range(num_layers):
                # `aggr` is taken from the enclosing test function.
                self.convs.append(
                    HeteroConv(
                        {
                            ('paper', 'to', 'paper'):
                            GCNConv(-1, 64, add_self_loops=False),
                            ('author', 'to', 'paper'):
                            SAGEConv((-1, -1), 64, add_self_loops=False),
                            ('paper', 'to', 'author'):
                            GATConv(
                                (-1, -1), 64, edge_dim=3,
                                add_self_loops=False),
                        },
                        aggr=aggr))

        def forward(self, x_dict, edge_index_dict, *args, **kwargs):
            """Apply every layer in turn, ReLU-ing each output tensor."""
            for conv in self.convs:
                x_dict = conv(x_dict, edge_index_dict, *args, **kwargs)
                x_dict = {key: x.relu() for key, x in x_dict.items()}
            return x_dict

    conv = MultiLayerHeteroConv(num_layers)

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = conv(data.x_dict,
             data.edge_index_dict,
             data.edge_attr_dict,
             edge_weight_dict=data.edge_weight_dict)

    forward_args = ('x_dict', 'edge_index_dict', 'edge_attr_dict',
                    'edge_weight_dict')
    hetero_conv_harness(conv,
                        data,
                        'author',
                        forward_args=forward_args,
                        enable_fp_exception=False)


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max', 'cat'])
@pytest.mark.parametrize('num_layers', [2, 5])
def test_hetero_conv_multiple_layers_with_data_transforms(aggr, num_layers):
    """Like ``test_hetero_conv_multiple_layers`` but the data first goes
    through ToUndirected, AddSelfLoops and NormalizeFeatures, and the layers
    also cover author->author and the 'rev_to' edge types."""
    data = get_dummy_data()
    data = T.ToUndirected()(data)
    data = T.AddSelfLoops()(data)
    data = T.NormalizeFeatures()(data)

    class MultiLayerHeteroConv(torch.nn.Module):
        """Stack of identical HeteroConv layers with a ReLU after each."""

        def __init__(self, num_layers):
            super().__init__()
            self.convs = torch.nn.ModuleList()
            for _ in range(num_layers):
                # `aggr` is taken from the enclosing test function.
                self.convs.append(
                    HeteroConv(
                        {
                            ('paper', 'to', 'paper'):
                            GCNConv(-1, 64, add_self_loops=False),
                            ('author', 'to', 'author'):
                            GCNConv(-1, 64, add_self_loops=False),
                            ('author', 'to', 'paper'):
                            SAGEConv((-1, -1), 64, add_self_loops=False),
                            ('paper', 'to', 'author'):
                            GATConv(
                                (-1, -1), 64, edge_dim=3,
                                add_self_loops=False),
                            ('paper', 'rev_to', 'author'):
                            SAGEConv((-1, -1), 64, add_self_loops=False),
                            ('author', 'rev_to', 'paper'):
                            GATConv(
                                (-1, -1), 64, edge_dim=3,
                                add_self_loops=False),
                        },
                        aggr=aggr))

        def forward(self, x_dict, edge_index_dict, *args, **kwargs):
            """Apply every layer in turn, ReLU-ing each output tensor."""
            for conv in self.convs:
                x_dict = conv(x_dict, edge_index_dict, *args, **kwargs)
                x_dict = {key: x.relu() for key, x in x_dict.items()}
            return x_dict

    conv = MultiLayerHeteroConv(num_layers)

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = conv(data.x_dict,
             data.edge_index_dict,
             data.edge_attr_dict,
             edge_weight_dict=data.edge_weight_dict)

    forward_args = ('x_dict', 'edge_index_dict', 'edge_attr_dict',
                    'edge_weight_dict')
    hetero_conv_harness(conv, data, 'author', forward_args=forward_args)


# pylint: disable=abstract-method
# pylint: disable=arguments-differ
class CustomConv(MessagePassing):
    """Message-passing layer ('add' aggregation) whose message is a lazily
    sized Linear applied to the concatenated ``x``, ``y`` and ``z``
    features."""

    def __init__(self, out_channels):
        super().__init__(aggr='add')
        # -1 leaves the input width to be inferred later.
        self.lin = Linear(-1, out_channels)

    def forward(self, x, edge_index, y, z):
        """Propagate ``x``, ``y`` and ``z`` along ``edge_index``."""
        return self.propagate(edge_index, x=x, y=y, z=z)

    def message(self, x_j, y_j, z_j):
        """Concatenate the three inputs on the last dim and project them."""
        return self.lin(torch.cat([x_j, y_j, z_j], dim=-1))


def test_hetero_conv_with_custom_conv():
    """Hand a HeteroConv made of ``CustomConv`` layers (one per edge type)
    to ``hetero_conv_harness``, forwarding the extra ``y`` and ``z`` dicts."""
    data = HeteroData()
    data['paper'].x = torch.randn(50, 32)
    data['paper'].y = torch.randn(50, 3)
    data['paper'].z = torch.randn(50, 3)
    data['author'].x = torch.randn(30, 64)
    data['author'].y = torch.randn(30, 3)
    data['author'].z = torch.randn(30, 3)
    data['paper', 'paper'].edge_index = get_edge_index(50, 50, 200)
    data['paper', 'author'].edge_index = get_edge_index(50, 30, 100)
    data['author', 'paper'].edge_index = get_edge_index(30, 50, 100)

    conv = HeteroConv({key: CustomConv(64) for key in data.edge_types})

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = conv(data.x_dict, data.edge_index_dict, data.y_dict, data.z_dict)

    forward_args = ('x_dict', 'edge_index_dict', 'y_dict', 'z_dict')
    hetero_conv_harness(conv, data, 'author', forward_args=forward_args)


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max'])
def test_to_hetero_transformation_basic(aggr):
    """Convert a two-layer SAGEConv model with ``to_hetero`` and hand the
    result to ``hetero_conv_harness``."""
    data = get_dummy_data()

    class Model(torch.nn.Module):
        """Two lazily-sized SAGEConv layers with a ReLU in between."""

        def __init__(self):
            super().__init__()
            self.conv1 = SAGEConv((-1, -1), 64)
            self.conv2 = SAGEConv((-1, -1), 64)

        def forward(self, x, edge_index):
            x = self.conv1(x, edge_index).relu()
            x = self.conv2(x, edge_index)
            return x

    model = Model()
    model = to_hetero(model, data.metadata(), aggr=aggr)

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = model(data.x_dict, data.edge_index_dict)

    forward_args = ('x_dict', 'edge_index_dict')
    hetero_conv_harness(model, data, 'author', forward_args=forward_args)


@pytest.mark.parametrize('aggr', ['sum', 'mean', 'min', 'max'])
def test_to_hetero_transformation_skip_connections(aggr):
    """Convert a two-layer SAGEConv model with Linear skip connections via
    ``to_hetero`` and hand the result to ``hetero_conv_harness``."""
    data = get_dummy_data()

    class Model(torch.nn.Module):
        """Two SAGEConv layers, each summed with a Linear skip branch."""

        def __init__(self):
            super().__init__()
            self.conv1 = SAGEConv((-1, -1), 64)
            self.lin1 = Linear(-1, 64)
            self.conv2 = SAGEConv((-1, -1), 64)
            self.lin2 = Linear(-1, 64)

        def forward(self, x, edge_index):
            x = self.conv1(x, edge_index) + self.lin1(x)
            x = x.relu()
            x = self.conv2(x, edge_index) + self.lin2(x)
            return x

    model = Model()
    model = to_hetero(model, data.metadata(), aggr=aggr)

    # One eager call before the harness; presumably this materialises the
    # lazily-sized (-1) parameters - confirm.
    _ = model(data.x_dict, data.edge_index_dict)

    forward_args = ('x_dict', 'edge_index_dict')
    hetero_conv_harness(model, data, 'author', forward_args=forward_args)


================================================
FILE: tests/gnn/nn/conv/test_hgt_conv.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
from collections import defaultdict from torch_geometric.nn import HGTConv from conv_utils import hetero_conv_harness, random_heterodata def test_hgt_conv_same_dimensions(): in_channels = defaultdict(lambda: 16) data, _ = random_heterodata(in_channels) conv = HGTConv(in_channels['author'], in_channels['paper'], metadata=data.metadata(), heads=2) hetero_conv_harness(conv, data, 'author') def test_hgt_conv_different_dimensions(): in_channels = defaultdict(lambda: 16) in_channels['paper'] = 32 data, _ = random_heterodata(in_channels) conv = HGTConv(in_channels=in_channels, out_channels=32, metadata=data.metadata(), heads=2) hetero_conv_harness(conv, data, 'author') def test_hgt_conv_lazy(): in_channels = defaultdict(lambda: 16) in_channels['paper'] = 32 data, _ = random_heterodata(in_channels) conv = HGTConv(-1, 32, metadata=data.metadata(), heads=2) _ = conv(data.x_dict, data.edge_index_dict) hetero_conv_harness(conv, data, 'author') ================================================ FILE: tests/gnn/nn/conv/test_hypergraph_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import HypergraphConv from conv_utils import conv_harness def test_hypergraph_conv_with_more_nodes_than_edges(): torch.manual_seed(42) in_channels, out_channels = (16, 32) hyperedge_index = torch.tensor([[0, 0, 1, 1, 2, 3], [0, 1, 0, 1, 0, 1]]) hyperedge_weight = torch.tensor([1.0, 0.5]) num_nodes = hyperedge_index[0].max().item() + 1 num_edges = hyperedge_index[1].max().item() + 1 x = torch.randn((num_nodes, in_channels)) hyperedge_attr = torch.randn((num_edges, in_channels)) conv = HypergraphConv(in_channels, out_channels, add_self_loops=False) conv_harness(conv, batch=(x, hyperedge_index, None, None, num_edges)) conv = HypergraphConv(in_channels, out_channels, use_attention=True, heads=2, add_self_loops=False) conv_harness(conv, batch=(x, hyperedge_index, hyperedge_weight, hyperedge_attr, num_edges)) def test_hypergraph_conv_with_more_edges_than_nodes(): torch.manual_seed(42) in_channels, out_channels = (16, 32) hyperedge_index = torch.tensor([[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 2], [0, 1, 2, 1, 2, 1, 0, 3, 3, 4, 4]]) hyperedge_weight = torch.tensor([1.0, 0.5, 0.8, 0.2, 0.7]) num_nodes = hyperedge_index[0].max().item() + 1 num_edges = hyperedge_index[1].max().item() + 1 x = torch.randn((num_nodes, in_channels)) conv = HypergraphConv(in_channels, out_channels) conv_harness(conv, batch=(x, hyperedge_index, None, None, num_edges)) conv_harness(conv, batch=(x, hyperedge_index, hyperedge_weight, None, num_edges)) ================================================ FILE: tests/gnn/nn/conv/test_le_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
from torch_geometric.nn import LEConv from conv_utils import conv_harness out_channels = 16 def test_le_conv(dataset): in_channels = dataset.num_node_features conv = LEConv(in_channels, out_channels) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_lg_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import LGConv from conv_utils import conv_harness out_channels = 16 def test_lg_conv(dataset): in_channels = dataset.num_node_features conv = LGConv() lin = torch.nn.Linear(in_channels, out_channels) conv_harness(conv, dataset, post_proc=lin) def test_lg_edge_weights_conv(dataset): in_channels = dataset.num_node_features conv = LGConv() lin = torch.nn.Linear(in_channels, out_channels) batch = (dataset.x, dataset.edge_index, dataset.edge_weight) conv_harness(conv, dataset, batch=batch, post_proc=lin) ================================================ FILE: tests/gnn/nn/conv/test_mf_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import MFConv from conv_utils import conv_harness def test_mf_conv(dataset): in_channels = dataset.num_node_features out_channels = 32 conv = MFConv(in_channels, out_channels, add_self_loops=False) conv_harness(conv, dataset) conv = MFConv((in_channels, in_channels), out_channels, add_self_loops=False) x2 = torch.randn(dataset.x.shape) batch = ((dataset.x, x2), dataset.edge_index) conv_harness(conv, batch=batch) batch = ((dataset.x, None), dataset.edge_index) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_nn_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import torch from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from torch_geometric.nn import NNConv from conv_utils import conv_harness out_channels = 16 def test_nn_conv(dataset): in_channels = dataset.num_node_features nn = Seq(Lin(3, 32), ReLU(), Lin(32, 8 * 32)) conv = NNConv(in_channels, out_channels, nn=nn) value = torch.rand(dataset.num_edges, 3) batch = (dataset.x, dataset.edge_index, value) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_pan_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest from torch_geometric.nn import PANConv from conv_utils import conv_harness @pytest.mark.skip(reason="TODO(AFS-262)") def test_pan_conv(dataset): in_channels = dataset.num_node_features conv = PANConv(in_channels, 32, filter_size=2, add_self_loops=False) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_pdn_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import PDNConv from conv_utils import conv_harness out_channels = 16 def test_pdn_conv(dataset): in_channels = dataset.num_node_features conv = PDNConv(in_channels, out_channels, edge_dim=8, hidden_channels=128, add_self_loops=False) edge_attr = torch.randn(dataset.num_edges, 8) batch = (dataset.x, dataset.edge_index, edge_attr) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_pna_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import PNAConv from conv_utils import conv_harness out_channels = 16 aggregators = ['sum', 'mean', 'min', 'max', 'var', 'std'] scalers = [ 'identity', 'amplification', 'attenuation', 'linear', 'inverse_linear' ] def test_pna_conv(dataset): in_channels = dataset.num_node_features deg = PNAConv.get_degree_histogram([dataset]) conv = PNAConv(in_channels, out_channels, aggregators, scalers, deg=deg, edge_dim=3, towers=4) value = torch.rand(dataset.num_edges, 3) batch = (dataset.x, dataset.edge_index, value) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_point_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from torch_geometric.nn import PointNetConv from conv_utils import conv_harness out_channels = 16 def test_point_net_conv(dataset): local_nn = Seq(Lin(16 + 3, 32), ReLU(), Lin(32, 32)) global_nn = Seq(Lin(32, 32)) conv = PointNetConv(local_nn, global_nn, add_self_loops=False) pos = torch.rand(dataset.num_nodes, 3) batch = (dataset.x, pos, dataset.edge_index) conv_harness(conv, dataset, batch=batch) def test_point2_net_conv(dataset): local_nn = Seq(Lin(16 + 3, 32), ReLU(), Lin(32, 32)) global_nn = Seq(Lin(32, 32)) conv = PointNetConv(local_nn, global_nn, add_self_loops=False) pos1 = torch.rand(dataset.num_nodes, 3) pos2 = torch.rand(dataset.num_nodes, 3) batch = (dataset.x, (pos1, pos2), dataset.edge_index) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_point_gnn_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric import seed_everything from torch_geometric.nn import MLP, PointGNNConv from conv_utils import conv_harness def test_pointgnn_conv(): seed_everything(42) x = torch.rand(6, 8) pos = torch.rand(6, 3) edge_index = torch.tensor([[0, 1, 1, 1, 2, 5], [1, 2, 3, 4, 3, 4]]) conv = PointGNNConv( mlp_h=MLP([8, 16, 3], norm='LayerNorm'), mlp_f=MLP([3 + 8, 16, 8], norm='LayerNorm'), mlp_g=MLP([8, 16, 8], norm='LayerNorm'), ) batch = (x, pos, edge_index) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_point_transformer_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from torch_geometric.nn import PointTransformerConv from conv_utils import conv_harness out_channels = 32 def test_point_transformer_conv(dataset): in_channels = dataset.num_node_features conv = PointTransformerConv(in_channels, out_channels, add_self_loops=False) pos = torch.rand(dataset.num_nodes, 3) batch = (dataset.x, pos, dataset.edge_index) conv_harness(conv, dataset, batch=batch, atol=1e-4, rtol=1e-3) def test_point_transformer_nn_conv(dataset): in_channels = dataset.num_node_features pos_nn = Seq(Lin(3, 16), ReLU(), Lin(16, 32)) attn_nn = Seq(Lin(32, 32), ReLU(), Lin(32, 32)) conv = PointTransformerConv(in_channels, out_channels, pos_nn, attn_nn, add_self_loops=False) pos = torch.rand(dataset.num_nodes, 3) batch = (dataset.x, pos, dataset.edge_index) conv_harness(conv, dataset, batch=batch, atol=1e-3, rtol=1e-2) ================================================ FILE: tests/gnn/nn/conv/test_ppf_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import torch import torch.nn.functional as F from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from torch_geometric.nn import PPFConv from conv_utils import conv_harness def test_ppf_conv(dataset): local_nn = Seq(Lin(16 + 4, 32), ReLU(), Lin(32, 32)) global_nn = Seq(Lin(32, 32)) conv = PPFConv(local_nn, global_nn, add_self_loops=False) pos = torch.rand(dataset.num_nodes, 3) n = F.normalize(torch.rand(dataset.num_nodes, 3), dim=-1) batch = (dataset.x, pos, n, dataset.edge_index) conv_harness(conv, dataset, batch=batch) def test_ppf2_conv(dataset): local_nn = Seq(Lin(16 + 4, 32), ReLU(), Lin(32, 32)) global_nn = Seq(Lin(32, 32)) conv = PPFConv(local_nn, global_nn, add_self_loops=False) pos1 = torch.rand(dataset.num_nodes, 3) pos2 = torch.rand(dataset.num_nodes, 3) n1 = F.normalize(torch.rand(dataset.num_nodes, 3), dim=-1) n2 = F.normalize(torch.rand(dataset.num_nodes, 3), dim=-1) batch = (dataset.x, (pos1, pos2), (n1, n2), dataset.edge_index) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_res_gated_graph_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. from torch_geometric.nn import ResGatedGraphConv from conv_utils import conv_harness out_channels = 16 def test_res_gated_graph_conv(dataset): in_channels = dataset.num_node_features conv = ResGatedGraphConv(in_channels, out_channels) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_rgat_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric import seed_everything from torch_geometric.nn import RGATConv from conv_utils import conv_harness @pytest.mark.parametrize('mod', [ 'additive', 'scaled', 'f-additive', 'f-scaled', ]) @pytest.mark.parametrize('attention_mechanism', [ 'within-relation', 'across-relation', ]) @pytest.mark.parametrize('attention_mode', [ 'additive-self-attention', 'multiplicative-self-attention', ]) def test_rgat_conv(mod, attention_mechanism, attention_mode): seed_everything(0) if attention_mechanism == 'within-relation': pytest.skip("Condition from torch.nonzero is used to compute softmax. " "Fixed size tensor can change softmax result.") if mod != 'additive' or attention_mode != 'additive-self-attention': pytest.skip("TODO(AFS-200)") x = torch.randn(4, 8) edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]]) edge_type = torch.tensor([0, 2, 1, 2]) edge_attr = torch.randn((4, 8)) conv = RGATConv(8, 20, num_relations=4, num_bases=4, mod=mod, attention_mechanism=attention_mechanism, attention_mode=attention_mode, heads=2, dim=1, edge_dim=8, add_self_loops=False) batch = (x, edge_index, edge_type, edge_attr) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_rgcn_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import FastRGCNConv, RGCNConv from conv_utils import conv_harness out_channels = 16 @pytest.mark.parametrize('rgcn', [FastRGCNConv, RGCNConv]) def test_rgcn_conv(rgcn): if rgcn == RGCNConv: pytest.skip("RGCNConv uses dynamic shapes") in_channels = 4 out_channels = 32 num_relations = 4 edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]]) edge_type = torch.tensor([0, 1, 1, 0, 0, 1]) conv = rgcn(in_channels, out_channels, num_relations, num_bases=15, add_self_loops=False) batch = (None, edge_index, edge_type) conv_harness(conv, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_sage_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. from torch_geometric.nn import SAGEConv from conv_utils import conv_harness out_channels = 16 aggregators = ['sum', 'mean', 'min', 'max', 'var', 'std'] def test_sage_conv(dataset): in_channels = dataset.num_node_features conv = SAGEConv(in_channels, out_channels, aggr=aggregators, normalize=True, root_weight=True, project=True, bias=True) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_sg_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
from torch_geometric.nn import SGConv from conv_utils import conv_harness out_channels = 16 def test_sg_conv(dataset): in_channels = dataset.num_node_features conv = SGConv(in_channels, out_channels, K=10, add_self_loops=False) conv_harness(conv, dataset) def test_sg_weights_conv(dataset): in_channels = dataset.num_node_features conv = SGConv(in_channels, out_channels, K=10, add_self_loops=False) batch = (dataset.x, dataset.edge_index, dataset.edge_weight) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_signed_conv.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import SignedConv from conv_utils import conv_harness out_channels = 16 def test_signed_conv(dataset): in_channels = dataset.num_node_features class Convs(torch.nn.Module): def __init__(self): super().__init__() self.conv1 = SignedConv(in_channels, out_channels, first_aggr=True, add_self_loops=False) self.conv2 = SignedConv(out_channels, 32, first_aggr=False, add_self_loops=False) def forward(self, x, pos_edge_index, neg_edge_index): x = self.conv1(x, pos_edge_index, neg_edge_index) x = self.conv2(x, pos_edge_index, neg_edge_index) return x conv = Convs() batch = (dataset.x, dataset.edge_index, dataset.edge_index) conv_harness(conv, dataset, batch=batch) ================================================ FILE: tests/gnn/nn/conv/test_simple_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import SimpleConv from conv_utils import conv_harness @pytest.mark.parametrize('combine_root', ['sum', 'cat', 'self_loop', None]) def test_simple_conv(dataset, combine_root): in_channels = dataset.num_node_features out_channels = 64 if combine_root == 'cat': in_channels = in_channels * 2 lin = torch.nn.Linear(in_channels, out_channels) conv = SimpleConv(combine_root=combine_root) conv_harness(conv, dataset, post_proc=lin) ================================================ FILE: tests/gnn/nn/conv/test_spline_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import SplineConv from torch_geometric.testing import withPackage from conv_utils import conv_harness @pytest.mark.parametrize("training", [True, False]) @withPackage('torch_spline_conv') def test_spline_conv(training): if training: pytest.skip('reason="TODO(AFS-216, AFS-218)') x1 = torch.randn(4, 4) x2 = torch.randn(2, 8) edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]]) value = torch.rand(edge_index[0].size(0), 3) conv = SplineConv(4, 32, dim=3, kernel_size=5) conv_harness(conv, batch=(x1, edge_index, value), training=training) conv = SplineConv((4, 8), 32, dim=3, kernel_size=5) batch = ((x1, x2), edge_index, value) conv_harness(conv, batch=batch, training=training) batch = ((x1, None), edge_index, value, (4, 2)) conv_harness(conv, batch=batch, training=training) ================================================ FILE: tests/gnn/nn/conv/test_ssg_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch
from torch_geometric.nn import SSGConv
from conv_utils import conv_harness


def test_ssg_conv(dataset):
    """Run SSGConv on `dataset`, then again with random per-edge values."""
    in_channels = dataset.num_node_features
    out_channels = 32

    conv = SSGConv(in_channels,
                   out_channels,
                   alpha=0.1,
                   K=10,
                   add_self_loops=False)
    conv_harness(conv, dataset)

    # Second run: one random scalar per edge as the third batch element.
    value = torch.rand(dataset.num_edges)
    conv_harness(conv, batch=(dataset.x, dataset.edge_index, value))


================================================
FILE: tests/gnn/nn/conv/test_supergat_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import pytest
from torch_geometric.nn import SuperGATConv
from conv_utils import conv_harness

out_channels = 16


@pytest.mark.skip(reason="TODO(AFS-36)")
@pytest.mark.parametrize('att_type', ['MX', 'SD'])
def test_supergat_conv(dataset, att_type):
    """Run SuperGATConv for both attention types (currently skipped)."""
    in_channels = dataset.num_node_features
    conv = SuperGATConv(in_channels,
                        out_channels,
                        heads=2,
                        attention_type=att_type,
                        neg_sample_ratio=1.0,
                        edge_sample_ratio=1.0,
                        add_self_loops=False)
    conv_harness(conv, dataset)


================================================
FILE: tests/gnn/nn/conv/test_tag_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import TAGConv
from conv_utils import conv_harness

out_channels = 16


def test_tag_conv(dataset):
    """Run TAGConv with default options on `dataset` through conv_harness."""
    in_channels = dataset.num_node_features
    conv = TAGConv(in_channels, out_channels)
    conv_harness(conv, dataset)


def test_tag_weights_conv(dataset):
    """Run TAGConv with an explicit batch that also carries edge weights."""
    in_channels = dataset.num_node_features
    conv = TAGConv(in_channels, out_channels)
    batch = (dataset.x, dataset.edge_index, dataset.edge_weight)
    conv_harness(conv, dataset, batch=batch)


================================================
FILE: tests/gnn/nn/conv/test_transformer_conv.py
================================================
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from torch_geometric.nn import TransformerConv from conv_utils import conv_harness out_channels = 16 def test_transformer_conv(dataset): in_channels = dataset.num_node_features conv = TransformerConv(in_channels, out_channels, heads=2, beta=True) conv_harness(conv, dataset) ================================================ FILE: tests/gnn/nn/conv/test_wl_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import WLConv from conv_utils import conv_harness @pytest.mark.skip(reason="Algorithm requires reading tensors which " "are placed on the IPU.") def test_wl_conv(): x = torch.tensor([1, 0, 0, 1]) edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]]) conv = WLConv() _ = conv(x, edge_index) conv_harness(conv, batch=(x, edge_index), training=False) ================================================ FILE: tests/gnn/nn/conv/test_wl_conv_continuous.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import WLConvContinuous from conv_utils import conv_harness def test_wl_conv_cont(dataset): in_channels = dataset.num_node_features conv = WLConvContinuous() lin = torch.nn.Linear(in_channels, 8) conv_harness(conv, dataset, post_proc=lin) batch = ((dataset.x, None), dataset.edge_index, dataset.edge_weight) conv_harness(conv, batch=batch, post_proc=lin) x2 = torch.randn(dataset.x.shape) batch = ((dataset.x, x2), dataset.edge_index, dataset.edge_weight) conv_harness(conv, batch=batch, post_proc=lin) ================================================ FILE: tests/gnn/nn/conv/test_x_conv.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch
from torch_geometric.nn import XConv
from torch_geometric.testing import withPackage

from conv_utils import conv_harness


@withPackage('torch_cluster')
def test_x_conv():
    """Run XConv on 8 random points, without and with a batch vector."""
    x = torch.randn(8, 16)
    pos = torch.rand(8, 5)
    batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])

    conv = XConv(16, 32, dim=5, kernel_size=2, dilation=2)

    # NOTE(review): the seed is set after x/pos/conv are created, so it does
    # not make those inputs reproducible; presumably it is meant to pin
    # randomness inside the layer's forward pass - confirm.
    torch.manual_seed(0)
    # We need to pass very loose atol and rtol here due to TODO(AFS-276)
    conv_harness(conv, batch=(x, pos), atol=0.1, rtol=0.1)
    conv_harness(conv, batch=(x, pos, batch), atol=0.1, rtol=0.1)


================================================
FILE: tests/gnn/nn/dense/dense_utils.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch
from poptorch_geometric import TrainingStepper


def dense_harness(dense,
                  batch=None,
                  post_proc=None,
                  loss_fn=torch.nn.MSELoss(),
                  num_steps=4,
                  atol=1e-5,
                  rtol=1e-4):
    """Wrap `dense` in a loss-producing module and step it with TrainingStepper.

    Args:
        dense: Module under test; called with the unpacked batch.
        batch: Inputs forwarded to `TrainingStepper.run`.
        post_proc: Optional callable applied to the module output.
        loss_fn: Loss comparing the output against an all-ones target. The
            default instance is created once, when the function is defined.
        num_steps: Step count forwarded to `TrainingStepper.run`.
        atol: Absolute tolerance forwarded to TrainingStepper.
        rtol: Relative tolerance forwarded to TrainingStepper.
    """

    class DenseWrapper(torch.nn.Module):
        """Adds optional post-processing and a training loss around `dense`."""

        def __init__(self, dense, loss_fn, post_proc=None):
            super().__init__()
            self.dense = dense
            self.loss_fn = loss_fn
            self.post_proc = post_proc

        def forward(self, *args):
            x = self.dense(*args)
            if self.post_proc is not None:
                x = self.post_proc(x)
            # In training mode return (output, loss); otherwise just output.
            if self.training:
                target = torch.ones_like(x)
                loss = self.loss_fn(x, target)
                return x, loss
            return x

    model = DenseWrapper(dense, loss_fn=loss_fn, post_proc=post_proc)
    stepper = TrainingStepper(model, atol=atol, rtol=rtol)
    stepper.run(num_steps, batch)


================================================
FILE: tests/gnn/nn/dense/test_convs.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest from torch_geometric.nn import DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv import torch from torch.nn import Linear as Lin from torch.nn import ReLU from torch.nn import Sequential as Seq from dense_utils import dense_harness @pytest.mark.parametrize( "conv_fn", [DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv]) def test_dense_convs(conv_fn): channels = 16 if conv_fn is DenseGINConv: nn = Seq(Lin(channels, channels), ReLU(), Lin(channels, channels)) conv = conv_fn(nn) else: conv = conv_fn(channels, channels) x = torch.randn((5, channels)) x = torch.cat([x, x.new_zeros(1, channels)], dim=0).view(2, 3, channels) adj = torch.Tensor([ [ [0, 1, 1], [1, 0, 1], [1, 1, 0], ], [ [0, 1, 0], [1, 0, 0], [0, 0, 0], ], ]) mask = torch.tensor([[1, 1, 1], [1, 1, 0]], dtype=torch.bool) batch = (x, adj, mask) dense_out = conv(*batch) assert dense_out.size() == (2, 3, channels) assert dense_out[1, 2].abs().sum().item() == 0 dense_harness(conv, batch) @pytest.mark.parametrize( "conv_fn", [DenseGCNConv, DenseGraphConv, DenseGINConv, DenseSAGEConv]) def test_dense_convs_with_broadcasting(conv_fn): batch_size, num_nodes, channels = 8, 3, 16 if conv_fn is DenseGINConv: nn = Seq(Lin(channels, channels), ReLU(), Lin(channels, channels)) conv = conv_fn(nn) else: conv = conv_fn(channels, channels) x = torch.randn(batch_size, num_nodes, channels) adj = torch.Tensor([ [0, 1, 1], [1, 0, 1], [1, 1, 0], ]) assert conv(x, adj).size() == (batch_size, num_nodes, channels) mask = torch.tensor([1, 1, 1], dtype=torch.bool) batch = (x, adj, mask) assert conv(*batch).size() == (batch_size, num_nodes, channels) dense_harness(conv, batch) ================================================ FILE: tests/gnn/nn/functional/test_bro.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn.functional import bro import poptorch @pytest.mark.skip(reason="TODO(AFS-269)") def test_bro(): batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 2, 2]) g1 = torch.tensor([ [0.2, 0.2, 0.2, 0.2], [0.0, 0.2, 0.2, 0.2], [0.2, 0.0, 0.2, 0.2], [0.2, 0.2, 0.0, 0.2], ]) g2 = torch.tensor([ [0.2, 0.2, 0.2, 0.2], [0.0, 0.2, 0.2, 0.2], [0.2, 0.0, 0.2, 0.2], ]) g3 = torch.tensor([ [0.2, 0.2, 0.2, 0.2], [0.2, 0.0, 0.2, 0.2], ]) class Model(torch.nn.Module): def forward(self, g1, g2, g3, batch): return bro(torch.cat([g1, g2, g3], dim=0), batch) model = Model() poptorch_model = poptorch.inferenceModel(model) ipu_out = poptorch_model(g1, g2, g3, batch) s = 0. for g in [torch.cat([g1, g2, g3]) / 3]: s += torch.norm(g @ g.t() - torch.eye(g.shape[0]), p=2) assert torch.isclose(s / 3., ipu_out) ================================================ FILE: tests/gnn/nn/functional/test_gini.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn.functional import gini import poptorch def test_gini(): w = torch.tensor([[0., 0., 0., 0.], [0., 0., 0., 1000.0]]) class Model(torch.nn.Module): def forward(self, w): return gini(w) model = Model() poptorch_model = poptorch.inferenceModel(model) ipu_out = poptorch_model(w) assert torch.isclose(ipu_out, torch.tensor(0.5)) ================================================ FILE: tests/gnn/nn/kge/kge_utils.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
from typing import List
import torch
from poptorch_geometric import TrainingStepper


def kge_harness(kge,
                dataloader,
                post_proc=None,
                loss_fn=torch.nn.MSELoss(),
                num_steps=4,
                atol=5e-3,
                rtol=5e-3,
                equal_nan=False,
                enable_fp_exception=True):
    """Wrap a KGE model and run it for up to `num_steps` batches.

    The model is wrapped in a module that optionally applies `post_proc` and,
    in training mode, also returns a loss against an all-ones target. A
    TrainingStepper is built around the wrapper and `run(1, batch)` is called
    once per batch taken from `dataloader`.

    Args:
        kge: Model under test; called with the unpacked batch.
        dataloader: Iterable of batches. If None, nothing is run.
        post_proc: Optional callable applied to the model output.
        loss_fn: Loss used in training mode. The default instance is created
            once, when the function is defined.
        num_steps: Maximum number of batches consumed from `dataloader`.
        atol: Absolute tolerance forwarded to TrainingStepper.
        rtol: Relative tolerance forwarded to TrainingStepper.
        equal_nan: Forwarded to TrainingStepper.
        enable_fp_exception: Forwarded to TrainingStepper.
    """

    class KgeWrapper(torch.nn.Module):
        """Adds optional post-processing and a training loss around `kge`."""

        def __init__(self, kge, loss_fn, post_proc=None):
            super().__init__()
            self.model = kge
            self.loss_fn = loss_fn
            self.post_proc = post_proc

        def forward(self, *args):
            result = self.model(*args)
            if self.post_proc is not None:
                # A list result is concatenated before post-processing.
                if isinstance(result, List):
                    result = torch.cat(result)
                result = self.post_proc(result)
            if self.training:
                # Checked again: without post_proc the result may still be
                # a list at this point.
                if isinstance(result, List):
                    result = torch.cat(result)
                target = torch.ones_like(result)
                loss = self.loss_fn(result, target)
                return result, loss
            return result

    model = KgeWrapper(kge, loss_fn=loss_fn, post_proc=post_proc)
    stepper = TrainingStepper(model,
                              atol=atol,
                              rtol=rtol,
                              equal_nan=equal_nan,
                              enable_fp_exception=enable_fp_exception)

    if dataloader is not None:
        for step, batch in enumerate(dataloader):
            # Stop once num_steps batches have been processed.
            if step == num_steps:
                break
            stepper.run(1, batch)


================================================
FILE: tests/gnn/nn/kge/test_complex.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import torch from torch_geometric.nn import ComplEx from kge_utils import kge_harness def test_complex_scoring(): model = ComplEx(num_nodes=5, num_relations=2, hidden_channels=1) model.node_emb.weight.data = torch.tensor([ [2.], [3.], [5.], [1.], [2.], ]) model.node_emb_im.weight.data = torch.tensor([ [4.], [1.], [3.], [1.], [2.], ]) model.rel_emb.weight.data = torch.tensor([ [2.], [3.], ]) model.rel_emb_im.weight.data = torch.tensor([ [3.], [1.], ]) head_index = torch.tensor([1, 3]) rel_type = torch.tensor([1, 0]) tail_index = torch.tensor([2, 4]) loader = model.loader(head_index, rel_type, tail_index, batch_size=5) kge_harness(model, loader) def test_complex(): model = ComplEx(num_nodes=10, num_relations=5, hidden_channels=32) assert str(model) == 'ComplEx(10, num_relations=5, hidden_channels=32)' head_index = torch.tensor([0, 2, 4, 6, 8]) rel_type = torch.tensor([0, 1, 2, 3, 4]) tail_index = torch.tensor([1, 3, 5, 7, 9]) loader = model.loader(head_index, rel_type, tail_index, batch_size=5) kge_harness(model, loader) ================================================ FILE: tests/gnn/nn/kge/test_distmult.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import DistMult from kge_utils import kge_harness def test_distmult(): model = DistMult(num_nodes=10, num_relations=5, hidden_channels=32) assert str(model) == 'DistMult(10, num_relations=5, hidden_channels=32)' head_index = torch.tensor([0, 2, 4, 6, 8]) rel_type = torch.tensor([0, 1, 2, 3, 4]) tail_index = torch.tensor([1, 3, 5, 7, 9]) loader = model.loader(head_index, rel_type, tail_index, batch_size=5) kge_harness(model, loader) ================================================ FILE: tests/gnn/nn/kge/test_rotate.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import RotatE from kge_utils import kge_harness def test_rotate(): model = RotatE(num_nodes=10, num_relations=5, hidden_channels=32) assert str(model) == 'RotatE(10, num_relations=5, hidden_channels=32)' head_index = torch.tensor([0, 2, 4, 6, 8]) rel_type = torch.tensor([0, 1, 2, 3, 4]) tail_index = torch.tensor([1, 3, 5, 7, 9]) loader = model.loader(head_index, rel_type, tail_index, batch_size=5) kge_harness(model, loader) ================================================ FILE: tests/gnn/nn/kge/test_transe.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from torch_geometric.nn import TransE from kge_utils import kge_harness def test_transe(): model = TransE(num_nodes=10, num_relations=5, hidden_channels=32) assert str(model) == 'TransE(10, num_relations=5, hidden_channels=32)' head_index = torch.tensor([0, 2, 4, 6, 8]) rel_type = torch.tensor([0, 1, 2, 3, 4]) tail_index = torch.tensor([1, 3, 5, 7, 9]) loader = model.loader(head_index, rel_type, tail_index, batch_size=5) kge_harness(model, loader) ================================================ FILE: tests/gnn/nn/nn_utils.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import copy import inspect import math import torch import helpers from torch_geometric.data import Batch, Data import poptorch # Need to import poptorch_geometric to ensure that our arg parser implementation is # registered with poptorch ahead of running these tests import poptorch_geometric # pylint: disable=unused-import class ModelWW(helpers.ModelWithWeights): def __init__(self, op, first_input_shape): super().__init__(op, first_input_shape) self.op = copy.deepcopy(op) self.loss_fn = torch.nn.MSELoss() self.first_input_shape = first_input_shape self.first_input_numel = first_input_shape.numel() self.out_fn = torch.nn.Linear(self.first_input_numel, self.first_input_numel) self._weights_before = self.out_fn.weight.detach().clone() def forward(self, xs): if callable(getattr(self.op, "forward", None)) and isinstance( inspect.signature(self.op.forward).return_annotation, tuple): x = self.op.forward(*xs) l = 0 else: x = self.op(*xs) if isinstance(x, (Batch, Data)): x1 = torch.flatten(x.x) elif isinstance(x, tuple): x1 = torch.flatten(x[0]) else: x1 = torch.flatten(x) if x1.shape.numel() != self.first_input_numel: ratio = math.ceil(self.first_input_numel / x1.shape.numel()) x1 = x1.repeat(ratio)[:self.first_input_numel] if x1.dtype != torch.float: x1 = x1.float() x1 = x1 if self.out_fn is None else self.out_fn(x1) x1 = x1.reshape(self.first_input_shape) target = torch.ones_like(x1) l = self.loss_fn(x1, target) return x, l def op_harness(op, inputs, assert_func=None, inference=False): if isinstance(inputs[0], (Batch, Data)): first_input_shape = inputs[0].x.shape else: first_input_shape = inputs[0].shape model = ModelWW(op, first_input_shape) # Run on CPU. native_out, _ = model(tuple(inputs)) # Run on IPU. 
if inference: poptorch_model = poptorch.inferenceModel(model) else: # The LR should be large enough that a single training step will # definitely cause weights to change optim = torch.optim.AdamW(model.parameters(), lr=0.1) poptorch_model = poptorch.trainingModel(model, optimizer=optim) poptorch_out, _ = poptorch_model(tuple(inputs)) # Training test - check weights have changed poptorch_model.assert_weights_changed() if assert_func is not None: assert_func(native_out, poptorch_out) return poptorch_out ================================================ FILE: tests/gnn/nn/norm/norm_utils.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch import helpers from torch_geometric.data import Batch, Data from gnn.nn.nn_utils import op_harness def assert_(native_out, poptorch_out): def check_inner_field(x, y): assert isinstance(x, type(y)), \ f"x type={type(x)} is different than y type={type(y)}" if isinstance(x, torch.Tensor): helpers.assert_allclose(actual=x, expected=y, atol=1e-04, rtol=1e-04, equal_nan=True) elif isinstance(x, (list, tuple)): for t, ct in zip(x, y): check_inner_field(t, ct) elif isinstance(x, (Batch, Data)): assert x.keys == y.keys, "Objects have different keys." for k in x.keys: check_inner_field(x[k], y[k]) elif x is not None: assert False, f"Unsupported types: x type={type(x)}, y type=" \ f"{type(y)}" check_inner_field(native_out, poptorch_out) def norm_harness(op, inputs, assert_func=None, inference=False): if assert_func is None: assert_func = assert_ poptorch_out = op_harness(op, inputs, assert_func, inference) return poptorch_out ================================================ FILE: tests/gnn/nn/norm/test_batch_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import BatchNorm from norm_utils import norm_harness @pytest.mark.parametrize('conf', [True, False]) def test_batch_norm(conf): x = torch.randn(100, 16) norm = BatchNorm(16, affine=conf, track_running_stats=conf) assert str(norm) == 'BatchNorm(16)' out = norm_harness(norm, [x]) assert out.size() == (100, 16) def test_batch_norm_single_element(): x = torch.randn(1, 16) norm = BatchNorm(16, track_running_stats=True, allow_single_element=True) assert str(norm) == 'BatchNorm(16)' out = norm_harness(norm, [x], inference=True) assert torch.allclose(out, x) ================================================ FILE: tests/gnn/nn/norm/test_diff_group_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from norm_utils import norm_harness from torch_geometric.nn import DiffGroupNorm def test_diff_group_norm(): x = torch.randn(6, 16) norm = DiffGroupNorm(16, groups=4, lamda=0.01) assert str(norm) == 'DiffGroupNorm(16, groups=4)' out = norm_harness(norm, [x]) assert out.size() == x.size() ================================================ FILE: tests/gnn/nn/norm/test_graph_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from norm_utils import norm_harness from torch_geometric.nn import GraphNorm def test_graph_norm(): torch.manual_seed(42) x = torch.randn(200, 16) batch = torch.arange(4).view(-1, 1).repeat(1, 50).view(-1) batch_size = int(batch.max() + 1) norm = GraphNorm(16) norm_harness(norm, [x]) norm_harness(norm, [x, batch, batch_size]) ================================================ FILE: tests/gnn/nn/norm/test_graph_size_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from norm_utils import norm_harness from torch_geometric.nn import GraphSizeNorm def test_graph_size_norm(): x = torch.randn(100, 16) batch = torch.repeat_interleave(torch.full((10, ), 10, dtype=torch.long)) batch_size = int(batch.max()) + 1 norm = GraphSizeNorm() assert str(norm) == 'GraphSizeNorm()' out = norm_harness(norm, [x, batch, batch_size]) assert out.size() == (100, 16) ================================================ FILE: tests/gnn/nn/norm/test_instance_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import InstanceNorm import helpers from gnn.nn.nn_utils import ModelWW import poptorch @pytest.mark.parametrize('conf', [True, False]) def test_instance_norm(conf): atol = None rtol = None if conf is True: # These values are based on torch_nn_test.py file # where InstanceNorm is tested from torch package. atol = 1e-3 rtol = 0.05 nodes_list = torch.randn(5, 100, 16) def test_body(inputs): norm = InstanceNorm(16, affine=conf, track_running_stats=conf) cpu_model = ModelWW(norm, inputs[0][0].shape) ipu_model = poptorch.trainingModel(ModelWW(norm, inputs[0][0].shape)) for x in inputs[0]: cpu_out = None ipu_out = None if len(inputs) > 1: model_inputs = [x] + inputs[1:] cpu_out = cpu_model(model_inputs) ipu_out = ipu_model(model_inputs) else: cpu_out = cpu_model([x]) ipu_out = ipu_model([x]) helpers.assert_allclose(actual=ipu_out[0], expected=cpu_out[0], atol=atol, rtol=rtol) test_body([nodes_list]) batch = torch.zeros(100, dtype=torch.long) batch_size = 1 test_body([nodes_list, batch, batch_size]) batch[:50] = torch.ones(50, dtype=torch.long) batch_size = 2 test_body([nodes_list, batch, batch_size]) ================================================ FILE: tests/gnn/nn/norm/test_layer_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from norm_utils import norm_harness from torch_geometric.nn import LayerNorm @pytest.mark.parametrize('affine', [True, False]) @pytest.mark.parametrize('mode', ['graph', 'node']) def test_layer_norm(affine, mode): x = torch.randn(100, 16) norm = LayerNorm(16, affine=affine, mode=mode) norm_harness(norm, [x]) batch = torch.zeros(100, dtype=torch.int64) batch_size = 1 norm_harness(norm, [x, batch, batch_size]) batch_size = 2 norm_harness(norm, [ torch.cat([x, x], dim=0), torch.cat([batch, batch + 1], dim=0), batch_size ]) ================================================ FILE: tests/gnn/nn/norm/test_mean_subtraction_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import torch from norm_utils import norm_harness from torch_geometric.nn import MeanSubtractionNorm def test_mean_subtraction_norm_no_batch(): x = torch.randn(6, 16) norm = MeanSubtractionNorm() assert str(norm) == 'MeanSubtractionNorm()' out = norm_harness(norm, [x]) assert out.size() == (6, 16) assert torch.allclose(out.mean(), torch.tensor(0.), atol=1e-04) def test_mean_subtraction_norm(): x = torch.randn(6, 16) batch = torch.tensor([0, 0, 1, 1, 1, 2]) norm = MeanSubtractionNorm() assert str(norm) == 'MeanSubtractionNorm()' out = norm_harness(norm, [x, batch, 3]) assert out.size() == (6, 16) assert torch.allclose(out[0:2].mean(), torch.tensor(0.), atol=1e-04) assert torch.allclose(out[0:2].mean(), torch.tensor(0.), atol=1e-04) ================================================ FILE: tests/gnn/nn/norm/test_msg_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from norm_utils import norm_harness from torch_geometric.nn import MessageNorm def test_message_norm(): norm = MessageNorm(learn_scale=True) assert str(norm) == 'MessageNorm(learn_scale=True)' x = torch.randn(100, 16) msg = torch.randn(100, 16) out = norm_harness(norm, [x, msg]) assert out.size() == (100, 16) norm = MessageNorm(learn_scale=False) assert str(norm) == 'MessageNorm(learn_scale=False)' out = norm_harness(norm, [x, msg]) assert out.size() == (100, 16) ================================================ FILE: tests/gnn/nn/norm/test_pair_norm.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from norm_utils import norm_harness from torch_geometric.nn import PairNorm @pytest.mark.parametrize('scale_individually', [False, True]) def test_pair_norm_no_batch(scale_individually): x = torch.randn(100, 16) norm = PairNorm(scale_individually=scale_individually) assert str(norm) == 'PairNorm()' out1 = norm_harness(norm, [x]) assert out1.size() == (100, 16) @pytest.mark.parametrize('scale_individually', [False, True]) def test_pair_norm(scale_individually): x = torch.randn(100, 16) batch = torch.zeros(100, dtype=torch.long) norm = PairNorm(scale_individually=scale_individually) assert str(norm) == 'PairNorm()' out1 = norm_harness(norm, [x]) batch_size = 2 out2 = norm_harness(norm, [ torch.cat([x, x], dim=0), torch.cat([batch, batch + 1], dim=0), batch_size ]) assert torch.allclose(out1, out2[:100], atol=1e-04) assert torch.allclose(out1, out2[100:], atol=1e-04) ================================================ FILE: tests/gnn/nn/pool/pool_utils.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import dataclasses

import torch
import helpers
from torch_geometric.data import Batch, Data

from gnn.nn.nn_utils import op_harness


def assert_(native_out, poptorch_out):
    """Recursively compare a native output structure with a PopTorch one.

    Tensors are compared with `helpers.assert_allclose` (atol/rtol 1e-4, NaNs
    treated as equal). Lists/tuples, Batch/Data objects and dataclasses are
    walked element by element. `None` values are accepted without comparison;
    any other type fails the assertion.
    """

    def check_inner_field(x, y):
        assert isinstance(x, type(y)), \
            f"x type={type(x)} is different than y type={type(y)}"
        if isinstance(x, torch.Tensor):
            helpers.assert_allclose(actual=x,
                                    expected=y,
                                    atol=1e-04,
                                    rtol=1e-04,
                                    equal_nan=True)
        elif isinstance(x, (list, tuple)):
            # NOTE(review): zip stops at the shorter sequence, so a length
            # mismatch between x and y is not detected here.
            for t, ct in zip(x, y):
                check_inner_field(t, ct)
        elif isinstance(x, (Batch, Data)):
            assert x.keys == y.keys, "Objects have different keys."
            for k in x.keys:
                check_inner_field(x[k], y[k])
        elif dataclasses.is_dataclass(x):
            # Only tensor-valued attributes of the dataclass are compared.
            for att in dir(x):
                x_field = getattr(x, att, None)
                if not callable(x_field) and isinstance(x_field, torch.Tensor):
                    check_inner_field(x_field, getattr(y, att, None))
        elif x is not None:
            assert False, f"Unsupported types: x type={type(x)}, y type=" \
                          f"{type(y)}"

    check_inner_field(native_out, poptorch_out)


def pool_harness(op, inputs, assert_func=None):
    """Run `op` through op_harness, defaulting the comparison to `assert_`.

    Returns whatever `op_harness` returns.
    """
    if assert_func is None:
        assert_func = assert_
    poptorch_out = op_harness(op, inputs, assert_func)
    return poptorch_out


================================================
FILE: tests/gnn/nn/pool/test_asap.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import pytest import torch from torch_geometric.nn import ASAPooling, GCNConv, GraphConv from pool_utils import pool_harness @pytest.mark.skip(reason="TODO(AFS-229, AFS-230, AFS-232, AFS-262)") def test_asap(): in_channels = 16 edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]]) num_nodes = edge_index.max().item() + 1 x = torch.randn((num_nodes, in_channels)) for GNN in [GraphConv, GCNConv]: pool = ASAPooling(in_channels, ratio=0.5, GNN=GNN, add_self_loops=False) assert pool.__repr__() == ('ASAPooling(16, ratio=0.5)') out = pool_harness(pool, [x, edge_index]) assert out[0].size() == (num_nodes // 2, in_channels) assert out[1].size() == (2, 2) pool = ASAPooling(in_channels, ratio=0.5, GNN=GNN, add_self_loops=True) assert pool.__repr__() == ('ASAPooling(16, ratio=0.5)') out = pool_harness(pool, [x, edge_index]) assert out[0].size() == (num_nodes // 2, in_channels) assert out[1].size() == (2, 4) pool = ASAPooling(in_channels, ratio=2, GNN=GNN, add_self_loops=False) assert pool.__repr__() == ('ASAPooling(16, ratio=2)') out = pool_harness(pool, [x, edge_index]) assert out[0].size() == (2, in_channels) assert out[1].size() == (2, 2) ================================================ FILE: tests/gnn/nn/pool/test_avg_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.data import Batch, Data from torch_geometric.nn import avg_pool, avg_pool_neighbor_x, avg_pool_x from pool_utils import pool_harness def test_avg_pool_x(): cluster = torch.tensor([0, 1, 0, 1, 2, 2]) x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) batch_size = int(batch.max().item()) + 1 out, _ = pool_harness(avg_pool_x, [cluster, x, batch, batch_size, 2]) assert out.tolist() == [[3, 4], [5, 6], [10, 11], [0, 0]] @pytest.mark.skip( reason="avg_pool uses torch.unique instruction which produces " "tensor with dynamic shape. This is not supported for Mk2.") def test_avg_pool(): cluster = torch.tensor([0, 1, 0, 1, 2, 2]) x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) pos = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) edge_attr = torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) data = Batch(x=x, pos=pos, edge_index=edge_index, edge_attr=edge_attr, batch=batch) data = pool_harness(avg_pool, [cluster, data, lambda x: x]) assert data.x.tolist() == [[3, 4], [5, 6], [10, 11]] assert data.pos.tolist() == [[1, 1], [2, 2], [4.5, 4.5]] assert data.edge_index.tolist() == [[0, 1], [1, 0]] assert data.edge_attr.tolist() == [4, 4] assert data.batch.tolist() == [0, 0, 1] @pytest.mark.parametrize('input_type', [Data, Batch]) def test_avg_pool_neighbor_x(input_type): if input_type == Batch: pytest.skip("TODO(AFS-231, AFS-229, AFS-230)") x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) data = input_type(x=x, edge_index=edge_index, batch=batch) data = pool_harness(avg_pool_neighbor_x, [data]) assert 
data.x.tolist() == [ [4, 5], [4, 5], [4, 5], [4, 5], [10, 11], [10, 11], ] assert data.edge_index.tolist() == edge_index.tolist() ================================================ FILE: tests/gnn/nn/pool/test_consecutive.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn.pool.consecutive import consecutive_cluster from pool_utils import pool_harness @pytest.mark.skip( reason="consecutive_cluster uses torch.unique instruction which produces " "tensor with dynamic shape. This is not supported for Mk2.") def test_consecutive_cluster(): src = torch.tensor([8, 2, 10, 15, 100, 1, 100]) out, perm = pool_harness(consecutive_cluster, [src]) assert out.tolist() == [2, 1, 3, 4, 5, 0, 5] assert perm.tolist() == [5, 1, 0, 2, 3, 6] ================================================ FILE: tests/gnn/nn/pool/test_decimation.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn.pool.decimation import decimation_indices from pool_utils import pool_harness @pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes " "and reads tensor values during runtime") def test_decimation_basic(): N_1, N_2 = 4, 6 decimation_factor = 2 ptr = torch.tensor([0, N_1, N_1 + N_2]) idx_decim, ptr_decim = pool_harness(decimation_indices, [ptr, decimation_factor], assert_func=lambda x, y: True) expected_size = (N_1 // decimation_factor) + (N_2 // decimation_factor) assert idx_decim.size(0) == expected_size expected = torch.tensor([0, N_1 // decimation_factor, expected_size]) assert torch.equal(ptr_decim, expected) @pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes " "and reads tensor values during runtime") def test_decimation_single_cloud(): N_1 = 4 decimation_factor = 2 ptr = torch.tensor([0, N_1]) idx_decim, ptr_decim = pool_harness(decimation_indices, [ptr, decimation_factor]) expected_size = N_1 // decimation_factor assert idx_decim.size(0) == expected_size assert torch.equal(ptr_decim, torch.tensor([0, expected_size])) @pytest.mark.skip(reason="Algorithm uses tensors with dynamic shapes " "and reads tensor values during runtime") def test_decimation_almost_empty(): N_1 = 4 decimation_factor = 666 # greater than N_1 ptr = torch.tensor([0, N_1]) idx_decim, ptr_decim = pool_harness(decimation_indices, [ptr, decimation_factor]) assert idx_decim.size(0) == 1 assert torch.equal(ptr_decim, torch.tensor([0, 1])) ================================================ FILE: tests/gnn/nn/pool/test_edge_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import EdgePooling from torch_geometric.utils import scatter from pool_utils import pool_harness def test_compute_edge_score_softmax(): edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) raw = torch.randn(edge_index.size(1)) e = pool_harness(EdgePooling.compute_edge_score_softmax, [raw, edge_index, 6]) assert torch.all(e >= 0) and torch.all(e <= 1) # Test whether all incoming edge scores sum up to one. assert torch.allclose(scatter(e, edge_index[1], reduce='sum'), torch.Tensor([1, 1, 1, 1, 1, 1])) def test_compute_edge_score_tanh(): edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) raw = torch.randn(edge_index.size(1)) e = pool_harness(EdgePooling.compute_edge_score_tanh, [raw, edge_index, 6]) assert torch.all(e >= -1) and torch.all(e <= 1) assert torch.all(torch.argsort(raw) == torch.argsort(e)) def test_compute_edge_score_sigmoid(): edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) raw = torch.randn(edge_index.size(1)) e = pool_harness(EdgePooling.compute_edge_score_sigmoid, [raw, edge_index, 6]) assert torch.all(e >= 0) and torch.all(e <= 1) assert torch.all(torch.argsort(raw) == torch.argsort(e)) @pytest.mark.skip( reason="Currently not possible to run on Mk2 due to the algorithm " "used in the __merge_edges__ function") def test_edge_pooling(): x = torch.Tensor([[0], [1], [2], [3], [4], [5], [-1]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4, 0]]) batch = torch.tensor([0, 0, 0, 0, 1, 1, 0]) op = EdgePooling(in_channels=1) assert str(op) == 'EdgePooling(1)' # Setting parameters fixed so we can test the expected outcome: op.lin.weight.data.fill_(1.) op.lin.bias.data.fill_(0.) 
# Test pooling: new_x, new_edge_index, new_batch, _ = pool_harness(op, [x, edge_index, batch]) assert new_x.size(0) == new_batch.size(0) == 4 assert new_edge_index.tolist() == [[0, 1, 1, 2, 2, 3], [0, 1, 2, 1, 2, 2]] assert new_batch.tolist() == [1, 0, 0, 0] # Test edge cases. x = torch.Tensor([[0], [1], [2], [3], [4], [5]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) new_x, new_edge_index, new_batch, _ = pool_harness(op, [x, edge_index, batch]) assert new_x.size(0) == new_batch.size(0) == 3 assert new_batch.tolist() == [1, 0, 0] assert new_edge_index.tolist() == [[0, 1, 1, 2, 2], [0, 1, 2, 1, 2]] def test_edge_unpooling(): x = torch.Tensor([[0], [1], [2], [3], [4], [5], [-1]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4, 0]]) batch = torch.tensor([0, 0, 0, 0, 1, 1, 0]) op = EdgePooling(in_channels=1) assert str(op) == 'EdgePooling(1)' # Setting parameters fixed so we can test the expected outcome: op.lin.weight.data.fill_(1.) op.lin.bias.data.fill_(0.) # Test pooling: new_x, _, _, unpool_info = op(x, edge_index, batch) out = pool_harness(op.unpool, [new_x, unpool_info]) assert out[0].size() == x.size() assert out[0].tolist() == [[1], [1], [5], [5], [9], [9], [-1]] assert torch.equal(out[1], edge_index) assert torch.equal(out[2], batch) ================================================ FILE: tests/gnn/nn/pool/test_fps.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
from copy import deepcopy import numpy as np import torch import torch_cluster from torch_geometric.nn import Linear import pytest import poptorch class FpsInferModel(torch.nn.Module): def forward(self, x, ptr, ratio, random_start): return poptorch.fps(x, ptr, ratio, random_start) class FpsTrainModel(torch.nn.Module): def __init__(self, op, linear): super().__init__() self.loss_fn = torch.nn.MSELoss() self.linear = linear self.op = op def forward(self, x, ptr, ratio, random_start): result = self.op(x, ptr, ratio, random_start) res = result.float() result = self.linear(res) target = torch.ones_like(result) loss = self.loss_fn(result, target) return result, loss @pytest.mark.parametrize('src_shape', [(1, 2), (2, 19), (3, 10), (19, 3)]) @pytest.mark.parametrize('ratio', [0.3, 0.5, 1.0]) def test_single_batch(src_shape, ratio): src = torch.rand(src_shape) ptr = [0, src_shape[0]] batch = torch.zeros(src_shape[0], dtype=torch.long) inference_model = poptorch.inferenceModel(FpsInferModel()) ipu_res = inference_model(src, ptr, ratio, random_start=False) ref_res = torch_cluster.fps(src, batch, ratio, random_start=False) assert all(ipu_res == ref_res) @pytest.mark.parametrize('src_shape', [(19, 3)]) @pytest.mark.parametrize( 'ptr', [[0, 13, 19], [0, 2, 3, 4, 9, 11, 19], [0, 1, 3, 4, 9, 18, 19]]) @pytest.mark.parametrize('ratio', [0.4, 0.6, 1.0]) def test_multi_batch(src_shape, ptr, ratio): src = torch.rand(src_shape) batch = torch.zeros(src_shape[0], dtype=torch.long) for i in range(1, len(ptr)): batch[ptr[i - 1]:ptr[i]] = i - 1 inference_model = poptorch.inferenceModel(FpsInferModel()) ipu_res = inference_model(src, ptr, ratio, random_start=False) ref_res = torch_cluster.fps(src, batch, ratio, random_start=False) assert all(ipu_res == ref_res) @pytest.mark.parametrize('src_shape', [(29, 3)]) @pytest.mark.parametrize('ptr', [[0, 29], [0, 2, 6, 11, 28, 29]]) @pytest.mark.parametrize('ratio', [1.0]) def test_random_start(src_shape, ptr, ratio): src = torch.rand(src_shape) 
batch = torch.zeros(src_shape[0], dtype=torch.long) for i in range(1, len(ptr)): batch[ptr[i - 1]:ptr[i]] = i - 1 inference_model = poptorch.inferenceModel(FpsInferModel()) ipu_res = inference_model(src, ptr, ratio, random_start=True) ref_res = torch_cluster.fps(src, batch, ratio, random_start=True) for i in range(1, len(ptr)): ipu_res_slice = set(ipu_res[ptr[i - 1]:ptr[i]].tolist()) ref_res_slice = set(ref_res[ptr[i - 1]:ptr[i]].tolist()) assert ipu_res_slice == ref_res_slice @pytest.mark.parametrize('src_shape', [(29, 3)]) @pytest.mark.parametrize('ptr', [[0, 29], [0, 2, 6, 11, 28, 29]]) @pytest.mark.parametrize('ratio', [0.15, 0.7, 1.0]) def test_train(src_shape, ptr, ratio): src = torch.rand(src_shape) batch = torch.zeros(src_shape[0], dtype=torch.long) for i in range(1, len(ptr)): batch[ptr[i - 1]:ptr[i]] = i - 1 deg = np.subtract(ptr[1:], ptr[0:-1]) out_size = np.ceil(deg * ratio).astype(int) out_size = np.cumsum(out_size, 0)[-1] linear_ipu = Linear(out_size, out_size) linear_ref = deepcopy(linear_ipu) ipu_model = FpsTrainModel(poptorch.fps, linear_ipu) ipu_model = poptorch.trainingModel(ipu_model) ipu_res, ipu_loss = ipu_model(src, ptr, ratio, random_start=False) ref_model = FpsTrainModel(torch_cluster.fps, linear_ref) ref_res, ref_loss = ref_model(src, batch, ratio, random_start=False) rtol = 1e-05 atol = 1e-06 assert np.allclose(ipu_res.tolist(), ref_res.tolist(), rtol=rtol, atol=atol) assert np.allclose(ipu_loss.tolist(), ref_loss.tolist(), rtol=rtol, atol=atol) ================================================ FILE: tests/gnn/nn/pool/test_glob.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch from torch_geometric.nn import ( global_add_pool, global_max_pool, global_mean_pool, ) from pool_utils import pool_harness def test_global_pool(): N_1, N_2 = 4, 6 x = torch.randn(N_1 + N_2, 4) batch = torch.tensor([0 for _ in range(N_1)] + [1 for _ in range(N_2)]) out = pool_harness(global_add_pool, [x, batch, 2]) assert out.size() == (2, 4) torch.testing.assert_close(out[0], x[:4].sum(dim=0)) torch.testing.assert_close(out[1], x[4:].sum(dim=0)) out = pool_harness(global_add_pool, [x, None]) assert out.size() == (1, 4) torch.testing.assert_close(out, x.sum(dim=0, keepdim=True)) out = pool_harness(global_mean_pool, [x, batch, 2]) assert out.size() == (2, 4) torch.testing.assert_close(out[0], x[:4].mean(dim=0)) torch.testing.assert_close(out[1], x[4:].mean(dim=0)) out = pool_harness(global_mean_pool, [x, None]) assert out.size() == (1, 4) torch.testing.assert_close(out, x.mean(dim=0, keepdim=True)) out = pool_harness(global_max_pool, [x, batch, 2]) assert out.size() == (2, 4) torch.testing.assert_close(out[0], x[:4].max(dim=0)[0]) torch.testing.assert_close(out[1], x[4:].max(dim=0)[0]) @pytest.mark.skip(reason="TODO(AFS-140)") def test_global_max_pool_no_batch(): N_1, N_2 = 4, 6 x = torch.randn(N_1 + N_2, 4) out = pool_harness(global_max_pool, [x, None]) assert out.size() == (1, 4) torch.testing.assert_close(out, x.max(dim=0, keepdim=True)[0]) def test_permuted_global_pool(): N_1, N_2 = 4, 6 x = torch.randn(N_1 + N_2, 4) batch = torch.cat([torch.zeros(N_1), torch.ones(N_2)]).to(torch.long) perm = torch.randperm(N_1 + N_2) px = x[perm] pbatch = batch[perm] px1 = px[pbatch == 0] px2 = px[pbatch == 1] out = pool_harness(global_add_pool, [px, pbatch, 2]) assert out.size() == (2, 4) assert torch.allclose(out[0], px1.sum(dim=0)) assert torch.allclose(out[1], px2.sum(dim=0)) out = pool_harness(global_mean_pool, [px, pbatch, 2]) assert out.size() == (2, 4) assert torch.allclose(out[0], px1.mean(dim=0)) assert torch.allclose(out[1], px2.mean(dim=0)) 
out = pool_harness(global_max_pool, [px, pbatch, 2]) assert out.size() == (2, 4) assert torch.allclose(out[0], px1.max(dim=0)[0]) assert torch.allclose(out[1], px2.max(dim=0)[0]) def test_dense_global_pool(): x = torch.randn(3, 16, 32) out = pool_harness(global_add_pool, [x, None]) assert torch.allclose(out, x.sum(dim=1)) ================================================ FILE: tests/gnn/nn/pool/test_graclus.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import graclus from torch_geometric.testing import withPackage from pool_utils import pool_harness @pytest.mark.skip(reason="TODO(AFS-245)") @withPackage('torch_cluster') def test_graclus(): edge_index = torch.tensor([[0, 1], [1, 0]]) weight = torch.tensor([1., 1.]) out = pool_harness(graclus, [edge_index, weight, 2]) assert out.tolist() == [0, 0] ================================================ FILE: tests/gnn/nn/pool/test_max_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.data import Batch, Data from torch_geometric.nn import max_pool, max_pool_neighbor_x, max_pool_x from pool_utils import pool_harness def test_max_pool_x(): cluster = torch.tensor([0, 1, 0, 1, 2, 2]) x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) batch_size = int(batch.max().item()) + 1 out, _ = pool_harness(max_pool_x, [cluster, x, batch, batch_size, 2]) assert out.tolist() == [[5, 6], [7, 8], [11, 12], [0, 0]] @pytest.mark.skip( reason="max_pool uses torch.unique instruction which produces " "tensor with dynamic shape. 
This is not supported for Mk2.") def test_max_pool(): cluster = torch.tensor([0, 1, 0, 1, 2, 2]) x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) pos = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) edge_attr = torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) data = Batch(x=x, pos=pos, edge_index=edge_index, edge_attr=edge_attr, batch=batch) data = pool_harness(max_pool, [cluster, data, lambda x: x]) assert data.x.tolist() == [[5, 6], [7, 8], [11, 12]] assert data.pos.tolist() == [[1, 1], [2, 2], [4.5, 4.5]] assert data.edge_index.tolist() == [[0, 1], [1, 0]] assert data.edge_attr.tolist() == [4, 4] assert data.batch.tolist() == [0, 0, 1] @pytest.mark.parametrize('input_type', [Data, Batch]) def test_max_pool_neighbor_x(input_type): if input_type == Batch: pytest.skip("TODO(AFS-231, AFS-229, AFS-230)") x = torch.Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2, 5, 4]]) batch = torch.tensor([0, 0, 0, 0, 1, 1]) data = input_type(x=x, edge_index=edge_index, batch=batch) data = pool_harness(max_pool_neighbor_x, [data]) assert data.x.tolist() == [ [7, 8], [7, 8], [7, 8], [7, 8], [11, 12], [11, 12], ] assert data.edge_index.tolist() == edge_index.tolist() ================================================ FILE: tests/gnn/nn/pool/test_mem_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import MemPooling from torch_geometric.utils import to_dense_batch from pool_utils import pool_harness import helpers import poptorch def test_mem_pool_basic(): torch.manual_seed(42) mpool1 = MemPooling(4, 8, heads=3, num_clusters=2) assert mpool1.__repr__() == 'MemPooling(4, 8, heads=3, num_clusters=2)' x = torch.randn(17, 4) batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4]) _, mask = to_dense_batch(x, batch) batch_size = int(batch.max() + 1) out1, S = pool_harness(mpool1, [x, batch, None, 4, batch_size]) assert out1.size() == (5, 2, 8) assert S[~mask].sum() == 0 assert round(S[mask].sum().item()) == x.size(0) def test_mem_pool_basic_custom_loss(): torch.manual_seed(42) x = torch.randn(17, 4) batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4]) class MemPoolWrapper(torch.nn.Module): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.op = MemPooling(4, 8, heads=3, num_clusters=2) def forward(self, *args, **kwargs): out1, S = self.op.forward(*args, **kwargs) loss = MemPooling.kl_loss(S) return out1, poptorch.identity_loss(loss, "sum") model = MemPoolWrapper() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=0.001) poptorch_model = poptorch.trainingModel(model, optimizer=optimizer) batch_size = int(batch.max() + 1) out1_expected, _ = model(x, batch, None, 4, batch_size) out1, loss = poptorch_model(x, batch, None, 4, batch_size) assert float(loss) > 0 assert out1.size() == (5, 2, 8) helpers.assert_allclose(actual=out1, expected=out1_expected) def test_mem_pool_chain(): torch.manual_seed(42) mpool1 = MemPooling(4, 8, heads=3, num_clusters=2) assert mpool1.__repr__() == 'MemPooling(4, 8, heads=3, num_clusters=2)' mpool2 = MemPooling(8, 4, heads=2, num_clusters=1) assert mpool2.__repr__() == 'MemPooling(8, 4, heads=2, num_clusters=1)' x = torch.randn(17, 4) batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4]) out1, _ = 
mpool1(x, batch) assert out1.size() == (5, 2, 8) out2, _ = pool_harness(mpool2, [out1]) assert out2.size() == (5, 1, 4) ================================================ FILE: tests/gnn/nn/pool/test_pan_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import PANConv, PANPooling from pool_utils import pool_harness @pytest.mark.skip(reason="The class is using filter_adj which produces " "tensors with dynamic shapes. It is not supported " "on Mk2.") def test_pan_pooling(): edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]]) num_nodes = edge_index.max().item() + 1 x = torch.randn((num_nodes, 16)) conv = PANConv(16, 32, filter_size=2) pool = PANPooling(32, ratio=0.5) assert str(pool) == 'PANPooling(32, ratio=0.5, multiplier=1.0)' x, M = conv(x, edge_index) row, col, edge_weight = M.coo() h, edge_index, edge_weight, _, perm, score = pool_harness( pool, [x, row, col, edge_weight]) assert h.size() == (2, 32) assert edge_index.size() == (2, 4) assert edge_weight.size() == (4, ) assert perm.size() == (2, ) assert score.size() == (2, ) ================================================ FILE: tests/gnn/nn/pool/test_pool_knn.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.nn import knn, knn_graph import helpers import poptorch class KnnModel(torch.nn.Module): def __init__(self, op) -> None: super().__init__() self.op = op def forward(self, *args, **kwargs): return self.op(*args, **kwargs) def test_knn(): x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) batch_x = torch.tensor([0, 0, 0, 0]) y = torch.Tensor([[-1, 0], [1, 0]]) batch_y = torch.tensor([0, 0]) assign_index_cpu = knn(x, y, 2, batch_x, batch_y) model = poptorch.inferenceModel(KnnModel(knn)) assign_index_ipu = model(x, y, 2, batch_x, batch_y) # There is no guarantee that indexes that knn returns must be in any # particualr order if there are multiple identical elements so we can't # compare results directly as one can be permutation of the other. helpers.assert_allequal(actual=assign_index_ipu.sort()[0], expected=assign_index_cpu.sort()[0]) def test_knn_graph(): x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) batch = torch.tensor([0, 0, 0, 0]) edge_index_cpu = knn_graph(x, k=2, batch=batch, loop=True) model = poptorch.inferenceModel(KnnModel(knn_graph)) edge_index_ipu = model(x, k=2, batch=batch, loop=True) # There is no guarantee that indexes that knn returns must be in any # particualr order if there are multiple identical elements so we can't # compare results directly as one can be permutation of the other. helpers.assert_allequal(actual=edge_index_cpu.sort()[0], expected=edge_index_ipu.sort()[0]) ================================================ FILE: tests/gnn/nn/pool/test_radius.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
from typing import Optional import torch import torch_geometric from torch import Tensor import poptorch def to_set(edge_index): # pylint: disable=R1721 return {(i, j) for i, j in edge_index.t().tolist()} def assert_fn(native_out, poptorch_out): poptorch_out = poptorch_out[poptorch_out != -1] dim = poptorch_out.size(0) // 2 poptorch_out = poptorch_out.reshape((2, dim)) native_out = native_out[native_out != -1] dim = native_out.size(0) // 2 native_out = native_out.reshape((2, dim)) assert to_set(poptorch_out) == to_set(native_out) def op_harness(*args, **kwargs): class Model(torch.nn.Module): def forward(self, x: Tensor, batch: Optional[Tensor] = None) -> Tensor: return torch_geometric.nn.radius_graph(x, r=2.5, batch=batch, loop=True) native_out = Model()(*args, **kwargs) model = poptorch.inferenceModel(Model()) poptorch_out = model(*args, **kwargs) assert_fn(native_out, poptorch_out) def test_radius_graph(): x = torch.tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]], dtype=torch.float) batch = torch.tensor([0, 0, 0, 0]) op_harness(x, batch) ================================================ FILE: tests/gnn/nn/pool/test_sag_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn import ( GATConv, GCNConv, GraphConv, SAGEConv, SAGPooling, ) from pool_utils import pool_harness @pytest.mark.skip(reason="The class is using filter_adj which produces " "tensors with dynamic shapes. 
It is not supported " "on Mk2.") @pytest.mark.parametrize('GNN', [GraphConv, GCNConv, GATConv, SAGEConv]) def test_sag_pooling(GNN): conv_kwargs = {'add_self_loops': False} in_channels = 16 edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]]) num_nodes = edge_index.max().item() + 1 x = torch.randn((num_nodes, in_channels)) pool1 = SAGPooling(in_channels, ratio=0.5, GNN=GNN, **conv_kwargs) out1 = pool_harness(pool1, [x, edge_index]) assert out1[0].size() == (num_nodes // 2, in_channels) assert out1[1].size() == (2, 2) pool2 = SAGPooling(in_channels, ratio=None, GNN=GNN, min_score=0.1, **conv_kwargs) out2 = pool_harness(pool2, [x, edge_index]) assert out2[0].size(0) <= x.size(0) and out2[0].size(1) == (16) assert out2[1].size(0) == 2 and out2[1].size(1) <= edge_index.size(1) pool3 = SAGPooling(in_channels, ratio=2, GNN=GNN, **conv_kwargs) out3 = pool_harness(pool3, [x, edge_index]) assert out3[0].size() == (2, in_channels) assert out3[1].size() == (2, 2) ================================================ FILE: tests/gnn/nn/pool/test_select_topk.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn.pool.select import SelectOutput, SelectTopK from torch_geometric.nn.pool.select.topk import topk from pool_utils import pool_harness @pytest.mark.skip( reason= "Inside the topk function, an index tensor is created that causes the " "output tensor to dynamically shape. 
It is not supported on MK2.") def test_topk_ratio(): x = torch.Tensor([2, 4, 5, 6, 2, 9]) batch = torch.tensor([0, 0, 1, 1, 1, 1]) perm1 = pool_harness(topk, [x, 0.5, batch]) assert perm1.tolist() == [1, 5, 3] assert x[perm1].tolist() == [4, 9, 6] assert batch[perm1].tolist() == [0, 1, 1] perm2 = pool_harness(topk, [x, 2, batch]) assert perm2.tolist() == [1, 0, 5, 3] assert x[perm2].tolist() == [4, 2, 9, 6] assert batch[perm2].tolist() == [0, 0, 1, 1] perm3 = pool_harness(topk, [x, 3, batch]) assert perm3.tolist() == [1, 0, 5, 3, 2] assert x[perm3].tolist() == [4, 2, 9, 6, 5] assert batch[perm3].tolist() == [0, 0, 1, 1, 1] @pytest.mark.skip( reason= "Inside the topk function, an index tensor is created that causes the " "output tensor to dynamically shape. It is not supported on MK2.") @pytest.mark.parametrize('min_score', [None, 2.0]) def test_select_topk(min_score): if min_score is not None: return x = torch.randn(6, 16) batch = torch.tensor([0, 0, 1, 1, 1, 1]) pool = SelectTopK(16, min_score=min_score) if min_score is None: assert str(pool) == 'SelectTopK(16, ratio=0.5)' else: assert str(pool) == 'SelectTopK(16, min_score=2.0)' out = pool_harness(pool, [x, batch]) assert isinstance(out, SelectOutput) assert out.num_nodes == 6 assert out.num_clusters <= out.num_nodes assert out.node_index.min() >= 0 assert out.node_index.max() < out.num_nodes assert out.cluster_index.min() == 0 assert out.cluster_index.max() == out.num_clusters - 1 ================================================ FILE: tests/gnn/nn/pool/test_topk_pool.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch_geometric.nn.pool import TopKPooling from torch_geometric.nn.pool.topk_pool import filter_adj from pool_utils import pool_harness @pytest.mark.skip(reason="The class is using filter_adj which produces " "tensors with dynamic shapes. 
It is not supported " "on Mk2.") def test_filter_adj(): edge_index = torch.tensor([[0, 0, 1, 1, 2, 2, 3, 3], [1, 3, 0, 2, 1, 3, 0, 2]]) edge_attr = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8]) perm = torch.tensor([1, 2, 3]) out = pool_harness(filter_adj, [edge_index, edge_attr, perm, 4]) assert out[0].tolist() == [[0, 1], [1, 0]] assert out[1].tolist() == [6, 8] @pytest.mark.skip(reason="The class is using filter_adj which produces " "tensors with dynamic shapes. It is not supported " "on Mk2.") def test_topk_pooling(): in_channels = 16 edge_index = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2]]) num_nodes = edge_index.max().item() + 1 x = torch.randn((num_nodes, in_channels)) pool1 = TopKPooling(in_channels, ratio=0.5) assert str(pool1) == 'TopKPooling(16, ratio=0.5, multiplier=1.0)' out1 = pool_harness(pool1, [x, edge_index]) assert out1[0].size() == (num_nodes // 2, in_channels) assert out1[1].size() == (2, 2) pool2 = TopKPooling(in_channels, ratio=None, min_score=0.1) assert str(pool2) == 'TopKPooling(16, min_score=0.1, multiplier=1.0)' out2 = pool_harness(pool2, [x, edge_index]) assert out2[0].size(0) <= x.size(0) and out2[0].size(1) == (16) assert out2[1].size(0) == 2 and out2[1].size(1) <= edge_index.size(1) pool3 = TopKPooling(in_channels, ratio=2) assert str(pool3) == 'TopKPooling(16, ratio=2, multiplier=1.0)' out3 = pool_harness(pool3, [x, edge_index]) assert out3[0].size() == (2, in_channels) assert out3[1].size() == (2, 2) ================================================ FILE: tests/gnn/nn/pool/test_voxel_grid.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import torch from torch_geometric.data import Batch from torch_geometric.nn import avg_pool, voxel_grid from torch_geometric.testing import withPackage from pool_utils import pool_harness @withPackage('torch_cluster') def test_voxel_grid(): pos = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]]) batch = torch.tensor([0, 0, 0, 1, 1]) out = pool_harness(voxel_grid, [pos, 5, batch]) assert out.tolist() == [0, 5, 3, 6, 7] out = pool_harness(voxel_grid, [pos, 5]) assert out.tolist() == [0, 5, 3, 0, 1] @withPackage('torch_cluster') def test_voxel_grid_with_optional_args(): pos = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]]) batch = torch.tensor([0, 0, 0, 1, 1]) cluster = pool_harness(voxel_grid, [pos, 5, batch, -1, [18, 14]]) assert cluster.tolist() == [0, 10, 4, 16, 17] cluster_no_batch = pool_harness(voxel_grid, [pos, 5, None, -1, [18, 14]]) assert cluster_no_batch.tolist() == [0, 10, 4, 0, 1] @withPackage('torch_cluster') def test_single_voxel_grid(): pos = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]) edge_index = torch.tensor([[0, 0, 3], [1, 2, 4]]) batch = torch.tensor([0, 0, 0, 1, 1]) x = torch.randn(5, 16) cluster = pool_harness(voxel_grid, [pos, 5, batch]) assert cluster.tolist() == [0, 0, 0, 1, 1] data = Batch(x=x, edge_index=edge_index, pos=pos, batch=batch) data = avg_pool(cluster, data) cluster_no_batch = pool_harness(voxel_grid, [pos, 5]) assert cluster_no_batch.tolist() == [0, 0, 0, 0, 0] ================================================ FILE: tests/gnn/nn/test_linear.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
from itertools import product import pytest import torch from torch_geometric.nn import HeteroLinear, Linear from dense.dense_utils import dense_harness weight_inits = ['glorot', "uniform", 'kaiming_uniform', None] bias_inits = ['zeros', None] @pytest.mark.parametrize('weight,bias', product(weight_inits, bias_inits)) def test_linear(weight, bias): lin = Linear(16, 32, weight_initializer=weight, bias_initializer=bias) x = torch.randn(1, 4, 16) dense_harness(lin, x) @pytest.mark.parametrize('with_bias', [True, False]) def test_hetero_linear(with_bias): x = torch.randn(10, 16) type_vec = torch.tensor([0, 0, 2, 1, 0, 2, 2, 2, 1, 2]) lin = HeteroLinear(16, 32, num_types=3, bias=with_bias) dense_harness(lin, (x, type_vec)) ================================================ FILE: tests/gnn/nn/test_loss.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch from torch.nn import L1Loss, MSELoss from poptorch_geometric import TrainingStepper def loss_harness(in_channels, out_channels, cpu_dataloader=None, ipu_dataloader=None, loss_fn=None, num_steps=4, atol=5e-4, rtol=5e-4): class LinearModel(torch.nn.Module): def __init__(self, loss_fn): assert loss_fn is not None assert hasattr(loss_fn, 'reduction') super().__init__() self.loss = loss_fn self.linear = torch.nn.Linear(in_channels, out_channels) def forward(self, *args): x = args[0] nodes_mask = args[1] target = args[2] result = self.linear(x) # Apply nodes mask, so that the loss may be computed properly if nodes_mask is not None: result[~nodes_mask] = 0 if self.training: # target = torch.ones_like(result) if nodes_mask is not None: target[~nodes_mask] = 0 loss = self.loss(result, target) # In case, the loss function applies mean reduction, the result # has to be rescaled by the effective size of the batch # (excluding padding). 
if nodes_mask is not None and self.loss.reduction == 'mean': size = nodes_mask.shape[0] real_size = torch.count_nonzero(nodes_mask) loss = loss * size / real_size return (result, loss) return result model = LinearModel(loss_fn) stepper = TrainingStepper(model, atol=atol, rtol=rtol) if cpu_dataloader is not None and ipu_dataloader is not None: for step, (cpu_batch, ipu_batch) in enumerate(zip(cpu_dataloader, ipu_dataloader)): if step == num_steps: break stepper.run(1, (cpu_batch.x, None, torch.ones(cpu_batch.x.shape[0], out_channels)), (ipu_batch.x, ipu_batch.nodes_mask, torch.ones(ipu_batch.x.shape[0], out_channels))) @pytest.mark.parametrize('loss_fn', [ L1Loss, MSELoss, ]) @pytest.mark.parametrize('reduction', ['mean', 'sum']) def test_loss_fixedsize_vs_regular_dataloader(loss_fn, reduction, dataloader, fixed_size_dataloader): first_sample = next(iter(dataloader)) in_channels = first_sample.num_node_features out_channels = in_channels * 2 loss_harness(in_channels, out_channels, cpu_dataloader=dataloader, ipu_dataloader=fixed_size_dataloader, loss_fn=loss_fn(reduction=reduction)) ================================================ FILE: tests/gnn/nn/test_mish.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
from copy import deepcopy import pytest import torch from torch_geometric.nn import Linear from poptorch import inferenceModel, trainingModel class MishReference(torch.nn.Module): def forward(self, x): return x * torch.tanh(torch.nn.functional.softplus(x)) class MishTrainModel(torch.nn.Module): def __init__(self, op, linear): super().__init__() self.loss_fn = torch.nn.MSELoss() self.linear = linear self.op = op def forward(self, x): result = self.op(x) res = result.float() result = self.linear(res) target = torch.ones_like(result) loss = self.loss_fn(result, target) return result, loss @pytest.mark.parametrize('size', [(13, ), (1, 64, 320, 320)]) def test_mish(size): x = torch.rand(size) ipu_model = inferenceModel(torch.nn.Mish()) ipu_res = ipu_model(x) ref_ipu_model = inferenceModel(MishReference()) ref_ipu_res = ref_ipu_model(x) ref_model = torch.nn.Mish() ref_res = ref_model(x) torch.allclose(ipu_res, ref_ipu_res) torch.allclose(ipu_res, ref_res) @pytest.mark.parametrize('size', [(11, ), (1, 64, 128)]) def test_mish_training(size): x = torch.rand(size) linear_ipu = Linear(size[-1], size[-1]) linear_ref = deepcopy(linear_ipu) model = MishTrainModel(torch.nn.Mish(), linear_ipu) ref_res, ref_loss = model(x) ipu_model = trainingModel(model) ipu_res, ipu_loss = ipu_model(x) model = MishTrainModel(MishReference(), linear_ref) ref_ipu_model = trainingModel(model) ref_ipu_res, ref_ipu_loss = ref_ipu_model(x) torch.allclose(ipu_res, ref_res) torch.allclose(ipu_loss, ref_loss) torch.allclose(ipu_res, ref_ipu_res) torch.allclose(ipu_loss, ref_ipu_loss) ================================================ FILE: tests/gnn/nn/test_sequential.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
from collections import OrderedDict from torch.nn import ReLU from torch_geometric.nn import Sequential, GCNConv, Linear from conv.conv_utils import conv_harness conv_kwargs = {"add_self_loops": False} def test_sequential(dataset): out_channels = in_channels = dataset.num_node_features model = Sequential('x, edge_index', [ (GCNConv(in_channels, 64, **conv_kwargs), 'x, edge_index -> x'), ReLU(inplace=True), (GCNConv(64, 64, **conv_kwargs), 'x, edge_index -> x'), ReLU(inplace=True), Linear(64, out_channels), ]) conv_harness(model, dataset) def test_sequential_with_ordered_dict(dataset): in_channels = dataset.num_node_features model = Sequential('x, edge_index', modules=OrderedDict([ ('conv1', (GCNConv(in_channels, 32, **conv_kwargs), 'x, edge_index -> x')), ('conv2', (GCNConv(32, 64, **conv_kwargs), 'x, edge_index -> x')), ])) conv_harness(model, dataset) ================================================ FILE: tests/gnn/nn/unpool/test_interpolate.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import helpers import torch import torch_geometric import poptorch import poptorch_geometric # pylint: disable=unused-import def test_knn_interpolate(): x = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]]) pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]]) pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]]) batch_x = torch.tensor([0, 0, 0, 1, 1, 1]) batch_y = torch.tensor([0, 0, 1, 1]) k = 2 class Model(torch.nn.Module): def forward(self, *args, **kwargs): return torch_geometric.nn.knn_interpolate(*args, **kwargs) model = poptorch.inferenceModel(Model()) poptorch_out = model(x, pos_x, pos_y, batch_x, batch_y, k) torch_geometric_out = torch_geometric.nn.knn_interpolate( x, pos_x, pos_y, batch_x, batch_y, k) helpers.assert_allclose(actual=poptorch_out, expected=torch_geometric_out) ================================================ FILE: tests/gnn/ops/test_knn.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch import torch_cluster import helpers from poptorch_geometric.ops.knn import knn import poptorch def assert_fn(native_out, poptorch_out, x, y): row_native, col_native = native_out row_poptorch, col_poptorch = poptorch_out helpers.assert_allclose(actual=row_poptorch, expected=row_native) assert col_native.shape == col_poptorch.shape for idx, expected_idx, y_idx in zip(col_native, col_poptorch, row_native): if idx != expected_idx: helpers.assert_allclose(actual=torch.norm(x[idx] - y[y_idx], dim=-1), expected=torch.norm(x[expected_idx] - y[y_idx], dim=-1)) def op_harness(op, reference_op, x, y, k, batch_x=None, batch_y=None): native_out = reference_op(x, y, k, batch_x, batch_y) class Model(torch.nn.Module): def forward(self, *args): return op(*args) model = poptorch.inferenceModel(Model()) poptorch_out = model(x, y, k, batch_x, batch_y) assert_fn(native_out, poptorch_out, x, y) @pytest.mark.parametrize("with_batch", [True, False]) def test_knn_basic(with_batch): pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]]) pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]]) k = 2 if with_batch: batch_x = torch.Tensor([0, 0, 0, 1, 1, 1]) batch_y = torch.Tensor([0, 0, 1, 1]) else: batch_x = None batch_y = None op_harness(knn, knn, pos_x, pos_y, k, batch_x, batch_y) op_harness(knn, torch_cluster.knn, pos_x, pos_y, k, batch_x, batch_y) def test_knn(): x = torch.Tensor([ [-1, -1], [-1, +1], [+1, +1], [+1, -1], [-1, -1], [-1, +1], [+1, +1], [+1, -1], ]) y = torch.Tensor([ [1, 0], [-1, 0], ]) batch_x = torch.Tensor([0, 0, 0, 0, 1, 1, 1, 1]) batch_y = torch.Tensor([0, 1]) k = 2 op_harness(knn, torch_cluster.knn, x, y, k, batch_x, batch_y) op_harness(knn, knn, x, y, k, batch_x, batch_y) op_harness(knn, torch_cluster.knn, x, y, k) op_harness(knn, knn, x, y, k) def test_knn_batch_skip(): x = torch.Tensor([ [-1, -1], [-1, +1], [+1, +1], [+1, -1], [-1, -1], [-1, +1], [+1, +1], [+1, -1], ]) y = torch.Tensor([ [1, 0], [-1, 0], ]) batch_x = 
torch.Tensor([0, 0, 0, 0, 1, 1, 1, 1]) batch_y = torch.Tensor([0, 1]) k = 2 op_harness(knn, torch_cluster.knn, x, y, k, batch_x, batch_y) op_harness(knn, knn, x, y, k, batch_x, batch_y) @pytest.mark.parametrize("with_batch", [True, False]) def test_knn_override(with_batch): pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]]) pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]]) k = 2 if with_batch: batch_x = torch.Tensor([0, 0, 0, 1, 1, 1]) batch_y = torch.Tensor([0, 0, 1, 1]) else: batch_x = None batch_y = None class Model(torch.nn.Module): def forward(self, *args): return torch_cluster.knn(*args) model = poptorch.inferenceModel(Model()) poptorch_out = model(pos_x, pos_y, k, batch_x, batch_y) native_out = torch_cluster.knn(pos_x, pos_y, k, batch_x, batch_y) assert_fn(native_out, poptorch_out, pos_x, pos_y) ================================================ FILE: tests/gnn/ops/test_knn_graph.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch import helpers from torch_geometric.nn import knn_graph from poptorch_geometric.ops.knn_graph import knn_graph as pyg_knn_graph import poptorch @pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source']) def test_knn_graph(flow): x = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]]) batch = torch.tensor([0, 0, 0, 1, 1, 1]) k = 2 class Model(torch.nn.Module): def forward(self, *args, **kwargs): return pyg_knn_graph(*args, **kwargs) model = poptorch.inferenceModel(Model()) poptorch_out = model(x, k, batch, True, flow) torch_geometric_out = knn_graph(x, k, batch, True, flow) pyg_cpu_out = pyg_knn_graph(x, k, batch, True, flow) helpers.assert_allclose(actual=poptorch_out, expected=pyg_cpu_out) helpers.assert_allclose(actual=poptorch_out, expected=torch_geometric_out) ================================================ FILE: tests/gnn/ops/test_knn_interpolate.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import helpers import torch from torch_geometric.nn import knn_interpolate import poptorch from poptorch_geometric.ops.knn_interpolate import knn_interpolate as pyg_knn_interpolate def test_knn_interpolate(): x = torch.Tensor([[1], [10], [100], [-1], [-10], [-100]]) pos_x = torch.Tensor([[-1, 0], [0, 0], [1, 0], [-2, 0], [0, 0], [2, 0]]) pos_y = torch.Tensor([[-1, -1], [1, 1], [-2, -2], [2, 2]]) batch_x = torch.tensor([0, 0, 0, 1, 1, 1]) batch_y = torch.tensor([0, 0, 1, 1]) k = 2 class Model(torch.nn.Module): def forward(self, *args, **kwargs): return pyg_knn_interpolate(*args, **kwargs) model = poptorch.inferenceModel(Model()) poptorch_out = model(x, pos_x, pos_y, batch_x, batch_y, k) torch_geometric_out = knn_interpolate(x, pos_x, pos_y, batch_x, batch_y, k) pyg_cpu_out = pyg_knn_interpolate(x, pos_x, pos_y, batch_x, batch_y, k) helpers.assert_allclose(actual=poptorch_out, expected=pyg_cpu_out) helpers.assert_allclose(actual=poptorch_out, expected=torch_geometric_out) ================================================ FILE: tests/gnn/ops/test_nearest.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest
import torch
from torch_cluster import nearest as nearest_or
from poptorch import nearest
import poptorch


def op_harness(op, reference_op, x, y, batch_x=None, batch_y=None):
    """Run ``reference_op`` natively and ``op`` under PopTorch, then compare.

    Batch vectors given as Python lists are converted to long tensors for
    the reference call only; the PopTorch model receives them unchanged.

    NOTE(review): the reference is called before the PopTorch model, so an
    exception raised by the reference propagates before ``op`` is executed.
    """
    batch_x_ref = torch.tensor(batch_x, dtype=torch.long) if isinstance(
        batch_x, list) else batch_x
    batch_y_ref = torch.tensor(batch_y, dtype=torch.long) if isinstance(
        batch_y, list) else batch_y

    native_out = reference_op(x, y, batch_x_ref, batch_y_ref)

    class Model(torch.nn.Module):
        def forward(self, *args):
            return op(*args)

    model = poptorch.inferenceModel(Model())
    poptorch_out = model(x, y, batch_x, batch_y)

    # Element-wise equality; every element must match.
    assert all(native_out == poptorch_out)


@pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
def test_nearest(dtype):
    """Compare poptorch.nearest with torch_cluster.nearest.

    Covers list and tensor batch vectors, all-zero and missing batch
    vectors, and several invalid inputs expected to raise ValueError.

    NOTE(review): in the ``pytest.raises`` cases, op_harness calls the
    reference first. If the reference raises, the PopTorch op is never
    invoked - confirm whether these cases are meant to cover it.
    """
    x = torch.tensor([
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
        [-2, -2],
        [-2, +2],
        [+2, +2],
        [+2, -2],
    ], dtype=dtype)
    y = torch.tensor([
        [-1, 0],
        [+1, 0],
        [-2, 0],
        [+2, 0],
    ], dtype=dtype)

    batch_x_lst = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_x = torch.tensor(batch_x_lst, dtype=torch.long)
    batch_y_lst = [0, 0, 1, 1]
    batch_y = torch.tensor(batch_y_lst, dtype=torch.long)

    # Same batch assignment passed once as lists and once as tensors.
    op_harness(nearest, nearest_or, x, y, batch_x_lst, batch_y_lst)
    op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    batch_x_lst_zeros = [0] * x.shape[0]
    batch_x_zeros = torch.tensor(batch_x_lst_zeros, dtype=torch.long)
    batch_y_lst_zeros = [0] * y.shape[0]
    batch_y_zeros = torch.tensor(batch_y_lst_zeros, dtype=torch.long)
    op_harness(nearest, nearest_or, x, y, batch_x=batch_x_zeros)
    op_harness(nearest, nearest_or, x, y, batch_y=batch_y_zeros)
    op_harness(nearest, nearest_or, x, y)

    # Invalid input: instance 1 only in batch_x
    batch_x = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_y = [0, 0, 0, 0]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: instance 1 only in batch_x (implicitly as batch_y=None)
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, None)

    # Invalid input: instance 2 only in batch_x
    # (i.e. instance in the middle missing)
    batch_x = [0, 0, 1, 1, 2, 2, 3, 3]
    batch_y = [0, 1, 3, 3]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: batch_x unsorted
    # NOTE(review): this list has 7 entries while x has 8 rows - confirm
    # the length mismatch is intentional.
    batch_x = [0, 0, 1, 0, 0, 0, 0]
    batch_y = [0, 0, 1, 1]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)

    # Invalid input: batch_y unsorted
    batch_x = [0, 0, 0, 0, 1, 1, 1, 1]
    batch_y = [0, 0, 1, 0]
    with pytest.raises(ValueError):
        op_harness(nearest, nearest_or, x, y, batch_x, batch_y)


================================================
FILE: tests/gnn/ops/test_radius_op.py
================================================
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

import pytest
import torch
import torch_cluster
from poptorch_geometric.ops.radius import radius, radius_graph
import poptorch


def to_set(edge_index):
    """Return the columns of ``edge_index`` as a set of ``(i, j)`` tuples."""
    # pylint: disable=R1721
    return {(i, j) for i, j in edge_index.t().tolist()}


def assert_fn(native_out, poptorch_out):
    """Assert both outputs hold the same edges, ignoring order and -1s.

    Entries equal to -1 are dropped from each output before it is reshaped
    to two rows and converted to a set of pairs.
    """
    poptorch_out = poptorch_out[poptorch_out != -1]
    dim = poptorch_out.size(0) // 2
    poptorch_out = poptorch_out.reshape((2, dim))
    native_out = native_out[native_out != -1]
    dim = native_out.size(0) // 2
    native_out = native_out.reshape((2, dim))
    assert to_set(poptorch_out) == to_set(native_out)


def op_harness(op, reference_op, *args, **kwargs):
    """Run ``reference_op`` natively and ``op`` under PopTorch, then compare."""
    native_out = reference_op(*args, **kwargs)

    class Model(torch.nn.Module):
        def forward(self, *args, **kwargs):
            return op(*args, **kwargs)

    model = poptorch.inferenceModel(Model())
    poptorch_out = model(*args, **kwargs)

    assert_fn(native_out, poptorch_out)


@pytest.mark.parametrize("with_batch", [True, False])
def test_radius_basic(with_batch):
    """radius vs torch_cluster.radius, with and without batch vectors."""
    x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
    y = torch.Tensor([[-1, 0], [1, 0]])
    if with_batch:
        batch_x = torch.tensor([0, 0, 0, 1])
        batch_y = torch.tensor([0, 1])
    else:
        batch_x = None
        batch_y = None

    op_harness(radius, torch_cluster.radius, x, y, 1.5, batch_x, batch_y)


def test_radius_upstream():
    """radius vs torch_cluster.radius: unbatched, batched, and skipped batch."""
    x = torch.tensor([
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -10],
    ])
    y = torch.tensor([
        [0, 0],
        [0, 1],
    ])

    batch_x = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1], dtype=torch.long)
    batch_y = torch.tensor([0, 1], dtype=torch.long)

    op_harness(radius, torch_cluster.radius, x, y, 2, max_num_neighbors=4)

    op_harness(radius,
               torch_cluster.radius,
               x,
               y,
               2,
               batch_x,
               batch_y,
               max_num_neighbors=4)

    # Skipping a batch
    batch_x = torch.tensor([0, 0, 0, 0, 2, 2, 2, 2], dtype=torch.long)
    batch_y = torch.tensor([0, 2], dtype=torch.long)
    op_harness(radius,
               torch_cluster.radius,
               x,
               y,
               2,
               batch_x,
               batch_y,
               max_num_neighbors=4)


@pytest.mark.parametrize('flow', ['source_to_target', 'target_to_source'])
def test_radius_graph(flow):
    """radius_graph vs torch_cluster.radius_graph for both flow directions."""
    x = torch.tensor([
        [-1, -1],
        [-1, +1],
        [+1, +1],
        [+1, -1],
    ])

    op_harness(radius_graph,
               torch_cluster.radius_graph,
               x,
               r=2.5,
               loop=True,
               flow=flow)


@pytest.mark.ipuHardwareRequired
def test_radius_graph_large():
    """radius_graph on 1000 seeded random 3-D points (needs IPU hardware)."""
    torch.manual_seed(40)
    x = torch.randn(1000, 3)

    op_harness(radius_graph,
               torch_cluster.radius_graph,
               x,
               r=2.5,
               loop=True,
               flow='target_to_source',
               max_num_neighbors=2000)


================================================
FILE: tests/gnn/ops/test_spline_conv_ops.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
# Tests for PyG torch_spline_conv ops integration with PopTorch from collections import namedtuple from copy import deepcopy import torch import pytest import helpers import poptorch if helpers.is_running_tests: from torch_spline_conv import spline_basis, spline_weighting else: def spline_basis(): pass def spline_weighting(): pass def gen_basis_input_data(num_edges, num_dims, max_kernel_size, dtype): torch.manual_seed(0) pseudo = torch.rand(num_edges, num_dims, dtype=dtype) kernel_size = torch.randint(1, max_kernel_size, (num_dims, )) is_open_spline = torch.randint(0, 2, (num_dims, ), dtype=torch.uint8) return pseudo, kernel_size, is_open_spline BasisParams = namedtuple('BasisParams', 'edges dims max_kernel_size degree') test_params_b = (BasisParams(6, 2, 6, 1), BasisParams(64, 3, 16, 3)) @pytest.mark.parametrize("params", test_params_b) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_spline_basis(params, dtype): class Model(torch.nn.Module): def __init__(self, degree): self.degree = degree super().__init__() def forward(self, p, ks, ios): return spline_basis(p, ks, ios, self.degree) *params, degree = params pseudo, kernel_size, is_open_spline = gen_basis_input_data(*params, dtype) model = Model(degree) pseudo_f32 = pseudo.type(torch.float32) basis, weight_index = model(pseudo_f32, kernel_size, is_open_spline) reference_output = (basis.type(dtype), weight_index) poptorch_model = poptorch.inferenceModel(deepcopy(model)) poptorch_output = poptorch_model(pseudo, kernel_size, is_open_spline) atol, rtol = (1e-3, 1e-5) if dtype == torch.float16 else (1e-5, 1e-8) helpers.assert_allclose(actual=poptorch_output, expected=reference_output, atol=atol, rtol=rtol) def gen_weighting_input_data(edges, in_ch, out_ch, kernel_size, num_splines, dtype): torch.manual_seed(0) x = torch.rand(edges, in_ch, dtype=dtype) weights = torch.rand(kernel_size, in_ch, out_ch, dtype=dtype) basis = torch.rand(edges, num_splines, dtype=dtype) weight_index = 
torch.randint(0, kernel_size, (edges, num_splines)) return x, weights, basis, weight_index WeightingParams = namedtuple('WeightingParams', 'edges in_ch out_ch kernel_size num_splines') test_params_w = (WeightingParams(6, 4, 4, 10, 8), WeightingParams(24, 5, 6, 3, 10)) @pytest.mark.parametrize("params", test_params_w) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_spline_weighting(params, dtype): class Model(torch.nn.Module): def forward(self, x, weight, basis, weight_index): return spline_weighting(x, weight, basis, weight_index) x, weight, basis, weight_index = gen_weighting_input_data(*params, dtype) model = Model() x_f32 = x.type(torch.float32) weight_f32 = weight.type(torch.float32) basis_f32 = basis.type(torch.float32) reference_output = model(x_f32, weight_f32, basis_f32, weight_index) poptorch_model = poptorch.inferenceModel(deepcopy(model)) weight_index = weight_index.type(torch.int32) poptorch_output = poptorch_model(x, weight, basis, weight_index) atol, rtol = (1e-2, 1e-3) if dtype == torch.float16 else (1e-5, 1e-8) helpers.assert_allclose(actual=poptorch_output, expected=reference_output.type(dtype), atol=atol, rtol=rtol) ================================================ FILE: tests/gnn/ops/test_to_dense_batch.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import pytest import torch import torch_geometric from torch_geometric.utils import to_dense_batch import helpers import poptorch def op_harness(reference_op, *args, **kwargs): class Model(torch.nn.Module): def forward(self, *args, **kwargs): return torch_geometric.utils.to_dense_batch(*args, **kwargs) model = poptorch.inferenceModel(Model()) poptorch_out = model(*args, **kwargs) native_out = reference_op(*args, **kwargs) helpers.assert_allclose(actual=poptorch_out, expected=native_out) def test_basic(): x = torch.arange(12).view(6, 2) op_harness(to_dense_batch, x, batch_size=1, max_num_nodes=11) def test_batch_size_not_set(): x = torch.arange(12).view(6, 2) batch = torch.tensor([0, 0, 1, 2, 2, 2]) with pytest.raises( ValueError, match= "Dynamic shapes disabled. Argument 'batch_size' needs to be set"): op_harness(to_dense_batch, x, batch) def test_batch_size_set(): x = torch.arange(12).view(6, 2) batch = torch.tensor([0, 0, 1, 2, 2, 2]) with pytest.raises( ValueError, match= "Dynamic shapes disabled. Argument 'max_num_nodes' needs to be set" ): op_harness(to_dense_batch, x, batch, batch_size=3) def test_batch_size_and_max_num_nodes_set(): x = torch.arange(12).view(6, 2) batch = torch.tensor([0, 0, 1, 2, 2, 2]) batch_size = int(batch.max()) + 1 max_num_nodes = 11 op_harness(to_dense_batch, x, batch, max_num_nodes=max_num_nodes, batch_size=batch_size) ================================================ FILE: tests/gnn/test_basic_gnn.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
import unittest.mock import pytest import torch import torch.nn.functional as F from torch_geometric import seed_everything from torch_geometric.datasets import FakeDataset from torch_geometric.nn.models import GAT, GCN, GIN, PNA, EdgeCNN, GraphSAGE from torch_geometric.transforms import Compose, GCNNorm, NormalizeFeatures from torch_geometric.utils import degree from torch_scatter import scatter_add import helpers from poptorch_geometric import TrainingStepper, set_aggregation_dim_size @pytest.fixture def data(): seed_everything(0) transform = Compose([GCNNorm(), NormalizeFeatures()]) dataset = FakeDataset(transform=transform, num_channels=64) data = dataset[0] data.num_classes = dataset.num_classes # Add a train_mask property that contains indices num_training_nodes = int(0.8 * data.num_nodes) data.train_mask = torch.randperm(data.num_nodes)[:num_training_nodes] return data def node_classification_harness(gnn, dataset, num_steps=40, atol=1e-4, rtol=1e-5): # Wrapper for a GNN model + a loss function class Wrapper(torch.nn.Module): def __init__(self, model, loss_fn): super().__init__() self.model = model self.loss_fn = loss_fn def forward(self, x, edge_index, train_mask, y): x = self.model(x, edge_index) out = F.log_softmax(x, dim=1) pred = out[train_mask] target = y[train_mask] loss = self.loss_fn(pred, target) return out, loss set_aggregation_dim_size(gnn, int(dataset.edge_index.max()) + 1) model = Wrapper(gnn, F.cross_entropy) stepper = TrainingStepper(model, atol=atol, rtol=rtol) batch = (dataset.x, dataset.edge_index, dataset.train_mask, dataset.y) stepper.run(num_steps, batch) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_node_classification_GCN(data): gnn = GCN(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes, normalize=False) node_classification_harness(gnn, data) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def 
test_node_classification_GraphSAGE(data): gnn = GraphSAGE(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes) node_classification_harness(gnn, data, atol=1e-3, rtol=1e-2) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_node_classification_GIN(data): gnn = GIN(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes) node_classification_harness(gnn, data) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_node_classification_GAT(data): gnn = GAT(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes, add_self_loops=False) node_classification_harness(gnn, data) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_node_classification_PNA(data): # Calculate the in-degree histogram deg = degree(data.edge_index[1]).long() deg = scatter_add(torch.ones_like(deg), deg) gnn = PNA(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes, aggregators=['sum', 'mean'], scalers=['linear'], deg=deg) # TODO: investigate numerical drift with PNAConv node_classification_harness(gnn, data, num_steps=1) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) @pytest.mark.parametrize('act', [torch.nn.ReLU(), torch.relu_]) def test_node_classification_EdgeCNN(data, act): if act == torch.relu_: # TODO: enable testing with the inplace relu_ op when this is supported pytest.skip( "Skipping testing inplace activation with dispatcher: " "RuntimeError: a leaf Variable that requires grad is being used in" "an in-place operation.") gnn = EdgeCNN(in_channels=data.num_node_features, hidden_channels=32, num_layers=2, out_channels=data.num_classes, dropout=0, act=act, norm=None, jk=None) node_classification_harness(gnn, data, num_steps=1) ================================================ FILE: tests/gnn/test_cluster_loader.py 
================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import pytest from torch_geometric import seed_everything from torch_geometric.datasets import FakeDataset from torch_geometric.loader import ClusterData from poptorch_geometric.cluster_loader import \ FixedSizeClusterLoader as IPUFixedSizeClusterLoader from poptorch_geometric.fixed_size_options import FixedSizeOptions from poptorch_geometric.pyg_cluster_loader import FixedSizeClusterLoader import poptorch @pytest.mark.parametrize('loader_cls', [FixedSizeClusterLoader, IPUFixedSizeClusterLoader]) @pytest.mark.parametrize('batch_size', [1, 2, 4]) @pytest.mark.parametrize('task', ['graph', 'node']) def test_fixed_size_dataloader_with_cluster_data(loader_cls, batch_size, benchmark, task): ipu_dataloader = loader_cls is IPUFixedSizeClusterLoader avg_degree = 3 num_parts = 8 seed_everything(42) dataset = FakeDataset( num_graphs=1, avg_num_nodes=128, avg_degree=avg_degree, num_channels=4, task=task, )[0] # Get a sensible value for the the maximum number of nodes. padded_num_nodes = dataset.num_nodes // num_parts * batch_size + 10 padded_num_edges = (avg_degree + 5) * padded_num_nodes cluster_data = ClusterData(dataset, num_parts=num_parts, log=False) # Define the expected tensor sizes in the output. data = cluster_data.data data_attributes = (k for k, _ in data if data.is_node_attr(k) or data.is_edge_attr(k)) expected_sizes = { k: ((padded_num_nodes if data.is_node_attr(k) else padded_num_edges), 0) for k in data_attributes } # Special case for edge_index which is of shape [2, num_edges]. expected_sizes['edge_index'] = (padded_num_edges, 1) # Special case for `y` being graph-lvl label if not data.is_node_attr('y'): expected_sizes['y'] = (2, 0) # Create a fixed size dataloader. 
kwargs = { 'cluster_data': cluster_data, 'fixed_size_options': FixedSizeOptions(num_nodes=padded_num_nodes, num_edges=padded_num_edges, num_graphs=2), 'batch_size': batch_size } if ipu_dataloader: kwargs['options'] = poptorch.Options() loader = loader_cls(**kwargs) # Check that each batch matches the expected size. for batch in loader: sizes_match = all( getattr(batch, k).shape[dim] == size for k, (size, dim) in expected_sizes.items()) assert sizes_match def loop(): for _ in loader: pass benchmark(loop) ================================================ FILE: tests/gnn/test_collate.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. import pytest import torch import torch_geometric as pyg from torch_geometric.data import Data, HeteroData from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler from utils import is_data from poptorch_geometric.collate import FixedSizeCollater from poptorch_geometric.fixed_size_options import FixedSizeOptions # pylint: disable=protected-access @pytest.fixture(params=[Data, HeteroData]) def _get_test_data(request, molecule, fake_hetero_data): if is_data(request.param): dataset = molecule assert dataset.num_nodes == 29 assert dataset.num_edges == 56 else: dataset = fake_hetero_data dataset['name'] = 'gdb_57518' assert dataset.num_nodes == 103 assert dataset.num_edges == 2391 return request.param, dataset @pytest.mark.parametrize('num_graphs,num_real_graphs', [(10, 8), (2, 1)]) @pytest.mark.parametrize('num_edges', [300, None]) @pytest.mark.parametrize('set_pad_values', [True, False]) def test_batch_masks(num_graphs, num_real_graphs, num_edges, set_pad_values): avg_num_nodes = 10 num_channels = 8 dataset = pyg.datasets.FakeDataset(num_graphs=16, avg_num_nodes=avg_num_nodes, avg_degree=2, num_channels=num_channels, edge_dim=2, task='graph') node_pad_value = 22.0 if set_pad_values else 0.0 edge_pad_value = 34.0 if set_pad_values else 0.0 graph_pad_value = 55.0 if 
set_pad_values else 0.0 num_batch_nodes = 100 num_batch_edges = num_batch_nodes * (num_batch_nodes - 1) \ if num_edges is None else num_edges num_batch_graphs = num_graphs fixed_size_options = None fixed_size_collater = None if set_pad_values: fixed_size_options = FixedSizeOptions(num_nodes=num_batch_nodes, num_edges=num_edges, num_graphs=num_graphs, node_pad_value=node_pad_value, edge_pad_value=edge_pad_value, graph_pad_value=graph_pad_value) fixed_size_collater = FixedSizeCollater( fixed_size_options=fixed_size_options, add_masks_to_batch=True) else: fixed_size_options = FixedSizeOptions( num_nodes=num_batch_nodes, num_edges=num_edges, num_graphs=num_graphs, ) fixed_size_collater = FixedSizeCollater( fixed_size_options=fixed_size_options, add_masks_to_batch=True) batch_sampler = BatchSampler(SequentialSampler(dataset), num_real_graphs, drop_last=False) for i, sample in enumerate(batch_sampler): num_real_nodes = sum(dataset[id].num_nodes for id in sample) num_real_edges = sum(dataset[id].num_edges for id in sample) result = fixed_size_collater([dataset[id] for id in sample]) # Check graph values assert len(result.graphs_mask) == num_batch_graphs assert int(result.graphs_mask.sum()) == num_real_graphs for j, mask in enumerate(result.graphs_mask): if mask.item() is True: assert dataset[i * num_real_graphs + j].y[0] == result.y[j] else: assert result.y[j] == graph_pad_value # Check nodes values assert len(result.nodes_mask) == num_batch_nodes assert int(result.nodes_mask.sum()) == num_real_nodes begin = 0 end = 0 for id in sample: end += dataset[id].num_nodes assert torch.all(result.nodes_mask[begin:end]) assert torch.equal(result.x[begin:end], dataset[id].x) begin += dataset[id].num_nodes assert not torch.any(result.nodes_mask[begin:]) for node_features in result.x[begin:]: for feature in node_features: assert feature == node_pad_value # Check edges values assert len(result.edges_mask) == num_batch_edges assert int(result.edges_mask.sum()) == num_real_edges begin = 
0 end = 0 for id in sample: end += dataset[id].num_edges assert torch.all(result.edges_mask[begin:end]) assert torch.equal(result.edge_attr[begin:end], dataset[id].edge_attr) begin += dataset[id].num_edges assert not torch.any(result.edges_mask[begin:]) for edge_features in result.edge_attr[begin:]: for feature in edge_features: assert feature == edge_pad_value @pytest.mark.parametrize('num_graphs,num_real_graphs', [(6, 2), (4, 2), (2, 1)]) @pytest.mark.parametrize('num_edges', [1200, None]) @pytest.mark.parametrize('set_pad_values', [True, False]) def test_batch_masks_heterodata(num_graphs, num_real_graphs, num_edges, set_pad_values, fake_hetero_dataset): dataset = fake_hetero_dataset num_node_types = 2 num_edge_types = 5 node_pad_value = 22.0 if set_pad_values else 0.0 edge_pad_value = 34.0 if set_pad_values else 0.0 graph_pad_value = 55.0 if set_pad_values else 0.0 num_batch_nodes = 150 fixed_size_options = None fixed_size_collater = None if set_pad_values: fixed_size_options = FixedSizeOptions(num_nodes=num_batch_nodes, num_edges=num_edges, num_graphs=num_graphs, node_pad_value=node_pad_value, edge_pad_value=edge_pad_value, graph_pad_value=graph_pad_value) fixed_size_collater = FixedSizeCollater( fixed_size_options=fixed_size_options, add_masks_to_batch=True) else: fixed_size_options = FixedSizeOptions( num_nodes=num_batch_nodes, num_edges=num_edges, num_graphs=num_graphs, ) fixed_size_collater = FixedSizeCollater( fixed_size_options=fixed_size_options, add_masks_to_batch=True) num_batch_edges = (num_batch_nodes * (num_batch_nodes - 1) \ if num_edges is None else num_edges) * num_edge_types num_batch_graphs = num_graphs num_batch_nodes *= num_node_types batch_sampler = BatchSampler(SequentialSampler(dataset), num_real_graphs, drop_last=False) for i, sample in enumerate(batch_sampler): num_real_nodes = sum(dataset[id].num_nodes for id in sample) num_real_edges = sum(dataset[id].num_edges for id in sample) result = fixed_size_collater([dataset[id] for id in 
sample]) # Check graph values assert len(result.graphs_mask) == num_batch_graphs assert int(result.graphs_mask.sum()) == num_real_graphs for j, mask in enumerate(result.graphs_mask): if mask.item() is True: assert dataset[i * num_real_graphs + j].y[0] == result.y[j] else: assert result.y[j] == graph_pad_value # Check nodes values assert sum(node_type.nodes_mask.shape[0] for node_type in result.node_stores) == num_batch_nodes for key in result.node_types: num_real_nodes = sum( dataset[id]._node_store_dict[key]['x'].shape[0] for id in sample) nodes_mask = result._node_store_dict[key]['nodes_mask'] assert torch.all(nodes_mask[0:num_real_nodes]) assert not torch.any(nodes_mask[num_real_nodes:]) x = result._node_store_dict[key]['x'] assert not torch.all(x[num_real_nodes:] - node_pad_value) # Check edges values assert sum(edge_type.edges_mask.shape[0] for edge_type in result.edge_stores) == num_batch_edges for key in result.edge_types: num_real_edges = sum( dataset[id]._edge_store_dict[key]['edge_index'].shape[1] for id in sample) edges_mask = result._edge_store_dict[key]['edges_mask'] assert torch.all(edges_mask[0:num_real_edges]) assert not torch.any(edges_mask[num_real_edges:]) def test_prune_nodes_single_input(_get_test_data): type_, dataset = _get_test_data if is_data(type_): fixed_size_options = FixedSizeOptions(num_nodes=10, num_graphs=2) else: fixed_size_options = FixedSizeOptions(num_nodes=dict(v0=10, v1=5), num_graphs=2) fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types) fixed_size_collater = FixedSizeCollater(fixed_size_options) result = fixed_size_collater._prune_nodes([dataset]) assert len(result) == 1 if is_data(type_): assert result[0].num_nodes == fixed_size_options.num_nodes assert result[0].x.shape[0] == fixed_size_options.num_nodes assert result[0].pos.shape[0] == fixed_size_options.num_nodes else: assert result[0].num_nodes == fixed_size_options.total_num_nodes for node_type, expected_val in fixed_size_options.num_nodes.items(): 
assert result[0][node_type].num_nodes == expected_val assert result[0][node_type].x.shape[0] == expected_val def test_prune_nodes_multiple_inputs(_get_test_data): type_, dataset = _get_test_data num_inputs = 4 input = [dataset] * num_inputs if is_data(type_): fixed_size_options = FixedSizeOptions(num_nodes=80, num_graphs=num_inputs + 1) else: fixed_size_options = FixedSizeOptions(num_nodes=dict(v0=80, v1=40), num_graphs=num_inputs + 1) fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types) fixed_size_collater = FixedSizeCollater(fixed_size_options) result = fixed_size_collater._prune_nodes(input) num_nodes = 0 for data in result: num_nodes += data.num_nodes assert num_nodes > 0 assert num_nodes == fixed_size_options.total_num_nodes def test_prune_nodes_multiple_inputs_minimal_num_node(_get_test_data): type_, dataset = _get_test_data num_inputs = 3 input = [dataset] * num_inputs if is_data(type_): fixed_size_options = FixedSizeOptions(num_nodes=3, num_graphs=num_inputs + 1) else: fixed_size_options = FixedSizeOptions(num_nodes=dict(v0=3, v1=3), num_graphs=num_inputs + 1) fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types) fixed_size_collater = FixedSizeCollater(fixed_size_options) result = fixed_size_collater._prune_nodes(input) assert len(result) == num_inputs num_nodes = 0 for data in result: num_nodes += data.num_nodes assert data.num_nodes > 0 assert num_nodes == fixed_size_options.total_num_nodes def test_prune_edges_single_input(_get_test_data): type_, dataset = _get_test_data if is_data(type_): fixed_size_options = FixedSizeOptions(num_nodes=dataset.num_nodes, num_edges=40) else: fixed_size_options = FixedSizeOptions(num_nodes=dict( v0=dataset["v0"].num_nodes, v1=dataset["v1"].num_nodes), num_edges={ ("v0", "e0", "v1"): 40, ("v0", "e0", "v0"): 30, ("v1", "e0", "v0"): 30, ("v0", "e1", "v1"): 40, ("v1", "e0", "v1"): 50, }) fixed_size_collator = FixedSizeCollater(fixed_size_options) result = 
def test_prune_edges_multiple_inputs(_get_test_data):
    """Prune the edges of four copies of one graph to a fixed edge budget.

    Every returned graph must still have nodes and edges, and the summed
    node and edge counts must match the totals stored in the options.
    """
    type_, dataset = _get_test_data
    num_inputs = 4
    graphs = [dataset] * num_inputs

    if is_data(type_):
        fixed_size_options = FixedSizeOptions(
            num_nodes=dataset.num_nodes * num_inputs,
            num_edges=80,
            num_graphs=num_inputs + 1)
    else:
        # The node budget leaves room for every copy; only edges get pruned.
        node_budget = {
            node_type: dataset[node_type].num_nodes * num_inputs
            for node_type in ("v0", "v1")
        }
        edge_budget = {
            ("v0", "e0", "v1"): 80,
            ("v0", "e0", "v0"): 120,
            ("v1", "e0", "v0"): 90,
            ("v0", "e1", "v1"): 100,
            ("v1", "e0", "v1"): 80,
        }
        fixed_size_options = FixedSizeOptions(num_nodes=node_budget,
                                              num_edges=edge_budget)

    pruned = FixedSizeCollater(fixed_size_options)._prune_edges(graphs)
    assert len(pruned) == num_inputs

    total_nodes = 0
    total_edges = 0
    for graph in pruned:
        assert graph.num_nodes > 0
        assert graph.num_edges > 0
        total_nodes += graph.num_nodes
        total_edges += graph.num_edges

    assert total_nodes == fixed_size_options.total_num_nodes
    assert total_edges == fixed_size_options.total_num_edges
def test_prune_nodes_multiple_inputs_should_throw_exception(_get_test_data):
    """Expect ``_prune_nodes`` to raise for a too-small node budget.

    Three copies of the test graph are pruned against a node budget of
    ``num_inputs - 1``, i.e. fewer nodes than there are input graphs.
    """
    type_, dataset = _get_test_data
    num_inputs = 3

    fixed_size_options = FixedSizeOptions(num_nodes=num_inputs - 1,
                                          num_graphs=num_inputs + 1)
    if not is_data(type_):
        fixed_size_options.to_hetero(dataset.node_types, dataset.edge_types)

    collater = FixedSizeCollater(fixed_size_options)
    graphs = [dataset] * num_inputs

    with pytest.raises(RuntimeError):
        collater._prune_nodes(graphs)
@pytest.mark.parametrize('data_type,fixed_size_hetero', [(Data, False),
                                                         (HeteroData, False),
                                                         (HeteroData, True)])
def test_prune_edges_fixed_size_collator(data_type, fixed_size_hetero,
                                         fake_hetero_dataset):
    """Collate random batches with ``trim_edges=True`` and check the sizes.

    Each collated batch must have exactly the node and edge totals stored
    in the options; per-attribute (homogeneous) or per-type (heterogeneous)
    sizes are checked as well.
    """
    batch_size = 10
    homogeneous = is_data(data_type)

    if homogeneous:
        avg_num_nodes = 30
        dataset = pyg.datasets.FakeDataset(num_graphs=99,
                                           avg_num_nodes=avg_num_nodes,
                                           avg_degree=5,
                                           num_channels=16,
                                           edge_dim=8)
    else:
        avg_num_nodes = 60
        dataset = fake_hetero_dataset

    num_graphs = batch_size + 1
    node_budget = avg_num_nodes * (batch_size * 2)
    if fixed_size_hetero:
        fixed_size_options = FixedSizeOptions(
            num_nodes={'v0': node_budget, 'v1': node_budget},
            num_edges={
                ("v0", "e0", "v1"): 80,
                ("v0", "e0", "v0"): 120,
                ("v1", "e0", "v0"): 90,
                ("v0", "e1", "v1"): 100,
                ("v1", "e0", "v1"): 80,
            },
            num_graphs=num_graphs)
    else:
        fixed_size_options = FixedSizeOptions(num_nodes=node_budget,
                                              num_edges=30,
                                              num_graphs=num_graphs)

    collate = FixedSizeCollater(fixed_size_options, trim_edges=True)
    index_batches = BatchSampler(RandomSampler(dataset),
                                 batch_size,
                                 drop_last=False)

    for indices in index_batches:
        result = collate([dataset[idx] for idx in indices])

        # Read the totals only after collating, as the original test does.
        total_nodes = fixed_size_options.total_num_nodes
        total_edges = fixed_size_options.total_num_edges
        assert result.num_nodes == total_nodes
        assert result.num_edges == total_edges

        if homogeneous:
            assert result.batch.shape[0] == total_nodes
            assert result.x.shape[0] == total_nodes
            assert result.edge_attr.shape[0] == total_edges
            assert result.edge_index.shape[1] == total_edges
            continue

        for node_type, expected_val in fixed_size_options.num_nodes.items():
            node_store = result[node_type]
            assert node_store.num_nodes == expected_val
            assert node_store.x.shape[0] == expected_val
        for edge_type, expected_num in fixed_size_options.num_edges.items():
            assert result[edge_type].edge_index.shape[1] == expected_num
def test_valid_args_fixed_size_collater(_get_test_data):
    """Check that the collater accepts a list and rejects a tuple."""
    _, dataset = _get_test_data
    num_inputs = 3
    graphs = [dataset] * num_inputs

    options = FixedSizeOptions(num_nodes=dataset.num_nodes * num_inputs,
                               num_graphs=num_inputs + 1)
    collate = FixedSizeCollater(options)

    # A plain list must be collated without raising.
    collate(graphs)

    # The same graphs wrapped in a tuple must be refused.
    with pytest.raises(TypeError, match='Expected list, got tuple.'):
        collate(tuple(graphs))
@pytest.mark.parametrize('num_nodes,num_edges,error_type',
                         [(10, 10000, 'nodes'), (10000, 10, 'edges')])
def test_fixed_size_collater_wrong_size_exceptions(_get_test_data, num_nodes,
                                                   num_edges, error_type):
    """Expect a ``RuntimeError`` naming the dimension that is too small.

    Either the node or the edge budget is set to 10 while the other one is
    generous; the error message must mention the undersized dimension.
    """
    _, dataset = _get_test_data
    num_inputs = 4
    graphs = [dataset] * num_inputs

    collate = FixedSizeCollater(
        FixedSizeOptions(num_nodes=num_nodes,
                         num_edges=num_edges,
                         num_graphs=num_inputs + 1))

    expected_message = (
        r"The fixed sizes given don't allocate enough space for the"
        fr" number of .* {error_type}")
    with pytest.raises(RuntimeError, match=expected_message):
        # TODO: Be more specific about error
        collate(graphs)
import inspect import pickle from functools import singledispatch import pytest import torch from torch_geometric.data import Batch, Data, HeteroData from torch_geometric.datasets import FakeDataset from torch_geometric.transforms import Pad import utils from utils import is_data from poptorch_geometric.stream_packing_sampler import StreamPackingSampler from poptorch_geometric.collate import CombinedBatchingCollater, make_exclude_keys from poptorch_geometric.dataloader import DataLoader as IPUDataLoader from poptorch_geometric.dataloader import \ FixedSizeDataLoader as IPUFixedSizeDataLoader from poptorch_geometric.dataloader import FixedSizeStrategy, OverSizeStrategy from poptorch_geometric.fixed_size_options import FixedSizeOptions from poptorch_geometric.pyg_collate import Collater from poptorch_geometric.pyg_dataloader import (DataLoader, FixedSizeDataLoader) from poptorch_geometric.types import PyGArgsParser from poptorch_geometric.common import DataBatch, HeteroDataBatch import poptorch # pylint: disable=protected-access @singledispatch def _compare_batches(batch_actual, batch_expected): raise ValueError(f'Unsupported data type: {type(batch_actual)}') @_compare_batches.register def _(batch_actual: DataBatch, batch_expected: DataBatch): for key in batch_expected.keys: expected_value = batch_expected[key] actual_value = batch_actual[key] if isinstance(expected_value, torch.Tensor): assert torch.equal(actual_value, expected_value) else: assert actual_value == expected_value @_compare_batches.register def _(batch_actual: HeteroDataBatch, batch_expected: HeteroDataBatch): for actual, expected in zip(batch_actual._global_store.values(), batch_expected._global_store.values()): assert actual == expected def compare_stores(actual, expected): for a, e in zip(actual, expected): for act, exp in zip(a.values(), e.values()): assert act.tolist() == exp.tolist() compare_stores(batch_actual.node_stores, batch_expected.node_stores) compare_stores(batch_actual.edge_stores, 
@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
def test_collater(data, request):
    """Check that ``Collater`` keeps only the requested keys.

    All keys except ``include_keys`` are excluded; the resulting batch must
    be of the matching ``Batch`` type, contain exactly ``include_keys``
    (besides ``ptr``, ``batch`` and ``edge_index``) and expose the same
    values through item and attribute access.
    """
    data = request.getfixturevalue(data)

    # Must be a real 1-tuple (note the trailing comma) so that membership
    # is tested per key rather than as a substring of the string 'x'.
    include_keys = ('x', 'y', 'z') if isinstance(data, Data) else ('x', )

    exclude_keys = make_exclude_keys(include_keys, data)
    collate_fn = Collater(exclude_keys=exclude_keys)
    batch = collate_fn([data])

    data_type = type(data)
    assert isinstance(batch, type(Batch(_base_cls=data_type)))

    bookkeeping_keys = ('ptr', 'batch', 'edge_index')
    batch_keys = [key for key in batch.keys if key not in bookkeeping_keys]
    assert len(batch_keys) == len(include_keys)

    homogeneous = is_data(data_type)
    for key in include_keys:
        if homogeneous:
            utils.assert_equal(actual=batch[key], expected=getattr(data, key))
            utils.assert_equal(actual=getattr(batch, key),
                               expected=getattr(data, key))
        else:
            for b_store, d_store in zip(batch.node_stores, data.node_stores):
                utils.assert_equal(actual=b_store[key],
                                   expected=getattr(d_store, key))
                utils.assert_equal(actual=getattr(b_store, key),
                                   expected=getattr(d_store, key))
@pytest.mark.parametrize('data', ['molecule', 'fake_hetero_data'])
def test_collater_invalid_keys(data, request):
    """Check ``Collater`` when ``exclude_keys`` names a key absent from data.

    ``'v'`` is not among the expected keys of either fixture; the batch
    must still contain every expected key with unchanged values.
    """
    data = request.getfixturevalue(data)
    if isinstance(data, Data):
        expected_keys = [
            'edge_index', 'pos', 'y', 'idx', 'z', 'edge_attr', 'x'
        ]
    else:
        data['y'] = torch.zeros(1)
        expected_keys = ['edge_index', 'x', 'y']

    data_type = type(data)
    batch = Collater(exclude_keys=('v', 'name'))([data])
    assert isinstance(batch, type(Batch(_base_cls=data_type)))

    batch_keys = [key for key in batch.keys if key not in ('ptr', 'batch')]
    assert len(expected_keys) == len(batch_keys)

    # Each entry: (key, stores of the batch, matching stores of the input).
    if is_data(data_type):
        checks = [(key, (batch, ), (data, )) for key in expected_keys]
    else:
        checks = [
            ('edge_index', batch.edge_stores, data.edge_stores),
            ('x', batch.node_stores, data.node_stores),
            ('y', (batch._global_store, ), (data._global_store, )),
        ]

    for key, batch_stores, data_stores in checks:
        for b_store, d_store in zip(batch_stores, data_stores):
            utils.assert_equal(actual=b_store[key],
                               expected=getattr(d_store, key))
            utils.assert_equal(actual=getattr(b_store, key),
                               expected=getattr(d_store, key))
def test_combined_batching_collater_invalid(molecule):
    """Expect an assertion error for 9 samples with a mini-batch size of 8."""
    combined = CombinedBatchingCollater(mini_batch_size=8,
                                        collater=Collater())
    samples = [molecule] * 9

    with pytest.raises(AssertionError, match='Invalid batch size'):
        combined(samples)
@pytest.mark.parametrize('loader', [
    FixedSizeDataLoader,
    dict(loader_cls=IPUFixedSizeDataLoader, device_iterations=3),
    dict(loader_cls=IPUFixedSizeDataLoader)
])
@pytest.mark.parametrize(
    'fixed_size_strategy',
    [FixedSizeStrategy.PadToMax, FixedSizeStrategy.StreamPack])
@pytest.mark.parametrize('dataset', ['pyg_qm9', 'fake_node_task_dataset'])
def test_fixed_size_dataloader(loader,
                               fixed_size_strategy,
                               benchmark,
                               dataset,
                               request,
                               batch_size=10):
    """Check the tensor sizes produced by the fixed-size data loaders.

    ``loader`` is either the plain ``FixedSizeDataLoader`` class or a dict
    describing the IPU loader (``loader_cls`` plus optional
    ``device_iterations``). Ten batches are drawn and their ``batch``,
    ``ptr`` and attribute sizes are compared against the padded sizes;
    the same iteration is then benchmarked.
    """
    dataset = request.getfixturevalue(dataset)
    ipu_dataloader = loader is not FixedSizeDataLoader

    # CombinedBatchingCollater adds an additional 0-th dimension.
    dim_offset = 0
    device_iterations = loader.get(
        'device_iterations',
        poptorch.Options().device_iterations) if ipu_dataloader else 1

    # Get a sensible value for the maximum number of nodes.
    padded_num_nodes = dataset[0].num_nodes * (batch_size + 20)
    padded_num_edges = dataset[0].num_edges * padded_num_nodes

    # Define the expected tensor sizes in the output.
    data = dataset[0]
    data_attributes = (k for k, _ in data()
                       if data.is_node_attr(k) or data.is_edge_attr(k))
    expected_sizes = {
        k: ((padded_num_nodes if data.is_node_attr(k) else padded_num_edges)
            * device_iterations, dim_offset)
        for k in data_attributes
    }
    # Special case for edge_index which is of shape [2, num_edges].
    expected_sizes['edge_index'] = (device_iterations * 2, dim_offset)
    # Special case for `y` being a graph-level label.
    if not data.is_node_attr('y'):
        expected_sizes['y'] = (batch_size * device_iterations, dim_offset)

    # Create a fixed size dataloader.
    kwargs = {
        'dataset': dataset,
        'batch_size': batch_size,
        'fixed_size_options': FixedSizeOptions(num_nodes=padded_num_nodes,
                                               num_edges=padded_num_edges,
                                               num_graphs=batch_size),
        'fixed_size_strategy': fixed_size_strategy
    }
    if ipu_dataloader:
        options = poptorch.Options()
        options.deviceIterations(device_iterations=device_iterations)
        kwargs['options'] = options
        loader = loader['loader_cls']
    loader = loader(**kwargs)

    # Check that each batch matches the expected size.
    loader_iter = iter(loader)
    repeats = 10
    for _ in range(repeats):
        batch = next(loader_iter)
        assert hasattr(batch, 'batch')
        assert hasattr(batch, 'ptr')

        # `device_iterations` is 1 for the plain loader, so the same
        # expressions cover both the IPU and the non-IPU case.
        assert list(
            batch.batch.size()) == [device_iterations * padded_num_nodes]
        if fixed_size_strategy != FixedSizeStrategy.StreamPack:
            assert list(
                batch.ptr.size()) == [device_iterations * (batch_size + 1)]

        # Assert key by key so that a failure names the offending attribute.
        for key, (size, dim) in expected_sizes.items():
            actual = getattr(batch, key).shape[dim]
            assert actual == size, (
                f'{key}: expected size {size} along dim {dim}, got {actual}')

    def loop():
        loader_iter = iter(loader)
        for _ in range(repeats):
            next(loader_iter)

    benchmark(loop)
("v1", "e0", "v0"): 7000, ("v0", "e1", "v1"): 8000, ("v1", "e0", "v1"): 9000, }, num_graphs=10, ), False), (FixedSizeOptions( num_nodes={ "v0": 500, "v1": 1000, }, num_edges=8000, num_graphs=10, ), False), (FixedSizeOptions( num_nodes={ "v0": 100, "v1": 200, }, num_edges={ ("v0", "e0", "v1"): 2000, ("v0", "e0", "v0"): 300, ("v1", "e0", "v0"): 1000, ("v0", "e1", "v1"): 100, ("v1", "e0", "v1"): 3000, }, num_graphs=10, ), True)]) def test_fixed_size_heterodataloader( loader, fixed_size_strategy, benchmark, dataset, fixed_size_options, requires_trimming, request, ): dataset = request.getfixturevalue(dataset) ipu_dataloader = loader is not FixedSizeDataLoader batch_size = fixed_size_options.num_graphs device_iterations = loader.get( 'device_iterations', poptorch.Options().device_iterations) if ipu_dataloader else 1 # Create a fixed size dataloader. kwargs = { 'dataset': dataset, 'batch_size': batch_size, 'fixed_size_options': fixed_size_options, 'fixed_size_strategy': fixed_size_strategy, } if ipu_dataloader: options = poptorch.Options() options.deviceIterations(device_iterations=device_iterations) kwargs['options'] = options loader = loader['loader_cls'] fixed_size_loader = loader(**kwargs) if requires_trimming: with pytest.raises(RuntimeError): next(iter(fixed_size_loader)) fixed_size_loader = loader( over_size_strategy=OverSizeStrategy.TrimNodesAndEdges, **kwargs) for batch in fixed_size_loader: for node_attr in filter(is_iterable, batch.node_stores): check_batch_and_ptr(node_attr) assert batch.num_nodes == fixed_size_options.total_num_nodes assert batch.num_edges == fixed_size_options.total_num_edges assert 'num_nodes' not in batch.node_types assert 'num_edges' not in batch.edge_types if 'y' in batch._node_store_dict.keys(): assert batch.y.shape[0] == batch_size * device_iterations assert batch.graphs_mask.shape[0] == batch_size * device_iterations assert sum(node_attr.batch.shape[0] for node_attr in filter(is_iterable, batch.node_stores) ) == 
@pytest.mark.parametrize('num_edges', [None, 500])
@pytest.mark.parametrize('num_graphs', [2, 10])
@pytest.mark.parametrize(
    'fixed_size_strategy',
    [FixedSizeStrategy.PadToMax, FixedSizeStrategy.StreamPack])
def test_dataloader_trims_to_fixed_sizes(num_edges, num_graphs,
                                         fixed_size_strategy,
                                         fake_molecular_dataset):
    """Check that all batches share the tensor shapes of the first batch.

    The loader is created with ``OverSizeStrategy.TrimNodesAndEdges`` and a
    node budget of 30 nodes per graph.
    """
    dataset = fake_molecular_dataset[:123]
    options = FixedSizeOptions(num_nodes=num_graphs * 30,
                               num_edges=num_edges,
                               num_graphs=num_graphs)
    train_dataloader = FixedSizeDataLoader(
        dataset,
        fixed_size_options=options,
        batch_size=num_graphs,
        fixed_size_strategy=fixed_size_strategy,
        over_size_strategy=OverSizeStrategy.TrimNodesAndEdges)

    # The first batch serves as the reference for all tensor attributes.
    reference = next(iter(train_dataloader))
    tensor_attrs = [
        name for name in reference.keys
        if isinstance(reference[name], torch.Tensor)
    ]

    for current in train_dataloader:
        for name in tensor_attrs:
            assert reference[name].shape == current[name].shape
def check_batch_and_ptr(src):
    """Assert that ``src`` contains both ``'batch'`` and ``'ptr'``."""
    for required_key in ('batch', 'ptr'):
        assert required_key in src
max_num_nodes = 300 max_num_edges = 1500 def check(b_idx, torch_batch, batch): for t, b in zip(torch_batch.node_stores, batch.node_stores): assert set(t.keys()) == set(b.keys()) for key in t.keys(): if isinstance(t[key], torch.Tensor): shape_dim = t[key].shape[0] slc = slice(b_idx * shape_dim, (b_idx + 1) * shape_dim) assert all((b[key][slc] == t[key]).tolist()) else: assert b[key] == t[key] else: max_num_nodes = 30 max_num_edges = 150 dataset = dataset[:123] def check(b_idx, torch_batch, batch): assert set(torch_batch.keys).issubset(set(batch.keys)) for key in torch_batch.keys: if isinstance(torch_batch[key], torch.Tensor): shape_dim = torch_batch[key].shape[0] slc = slice(b_idx * shape_dim, (b_idx + 1) * shape_dim) if isinstance(batch[key], torch.Tensor): assert all( (batch[key][slc] == torch_batch[key]).tolist()) else: assert sum(torch_batch[key].tolist()) == batch[key] else: assert batch[key] == torch_batch[key] dataset.transform = Pad(max_num_nodes=max_num_nodes, max_num_edges=max_num_edges) options = poptorch.Options() if device_iterations is not None: options.deviceIterations(device_iterations=device_iterations) loader = IPUDataLoader(dataset=dataset, batch_size=batch_size, options=options) # Create PyG's dataloader to compare the created batches. pyg_loader = DataLoader(dataset=dataset, batch_size=batch_size) torch_loader_iter = iter(pyg_loader) for idx, batch in enumerate(loader): if is_HeteroData: for node_attr in filter(is_iterable, batch.node_stores): check_batch_and_ptr(node_attr) else: check_batch_and_ptr(batch) # Check that each batch matches the expected size. idx_range = slice(idx * batch_size, (idx + 1) * batch_size) assert batch.num_graphs == batch_size assert batch.num_nodes == sum(d.num_nodes for d in dataset[idx_range]) assert batch.num_edges == sum(d.num_edges for d in dataset[idx_range]) num_batches = device_iterations or 1 # Compare batches from PyG's and PopPyG's dataloaders. 
@pytest.mark.parametrize('dataset',
                         ['fake_molecular_dataset', 'fake_hetero_dataset'])
@pytest.mark.parametrize('allow_skip_data', [True, False])
def test_dataloader_with_sampler_num_nodes(allow_skip_data, dataset, request):
    """Check batch node counts when a ``StreamPackingSampler`` is supplied.

    The loader gets one node more than the sampler's ``max_num_nodes``;
    every batch must then report that many nodes per node type.
    """
    # Derived from the fixture *name*, before the name is resolved below.
    num_node_types = 1 if dataset != 'fake_hetero_dataset' else 2
    dataset = request.getfixturevalue(dataset)

    if isinstance(dataset[0], Data):
        dataset = dataset[:10]
        sampler_num_nodes = 100
    else:
        sampler_num_nodes = 1000

    sampler = StreamPackingSampler(dataset,
                                   max_num_graphs=1,
                                   max_num_nodes=sampler_num_nodes,
                                   allow_skip_data=allow_skip_data)

    padded_num_nodes = sampler_num_nodes + 1
    dataloader = FixedSizeDataLoader(
        dataset,
        fixed_size_options=FixedSizeOptions(num_nodes=padded_num_nodes),
        batch_sampler=sampler)

    for batch in dataloader:
        assert batch.num_nodes == padded_num_nodes * num_node_types
expected_num_batches = 1 fixed_size_options = FixedSizeOptions(num_nodes=total_num_nodes + 1, num_edges=total_num_edges + 1, num_graphs=101) loader = create_loader(ds, batch_size=101, fixed_size_options=fixed_size_options, fixed_size_strategy=FixedSizeStrategy.StreamPack) batches_created = sum(1 for _ in loader) assert batches_created == expected_num_batches # There is no space for padding graph in the first batch (not enough # graphs) so loader should create two batches. expected_num_batches = 2 fixed_size_options = FixedSizeOptions(num_nodes=total_num_nodes + 1, num_edges=total_num_edges + 1, num_graphs=100) loader = create_loader(ds, batch_size=100, fixed_size_options=fixed_size_options, fixed_size_strategy=FixedSizeStrategy.StreamPack) batches_created = sum(1 for _ in loader) assert batches_created == expected_num_batches # There is no space for padding graph in the first batch (not enough # nodes) so loader should create two batches. expected_num_batches = 2 fixed_size_options = FixedSizeOptions(num_nodes=total_num_nodes, num_edges=total_num_edges + 1, num_graphs=101) loader = create_loader(ds, batch_size=101, fixed_size_options=fixed_size_options, fixed_size_strategy=FixedSizeStrategy.StreamPack) batches_created = sum(1 for _ in loader) assert batches_created == expected_num_batches # There is no space for padding graph in the first batch (not enough # edges) so loader should create two batches. 
def test_fixed_size_dataloader_with_default_values(fake_large_dataset):
    """Exercise ``FixedSizeDataLoader`` when no fixed-size options are given.

    Both the plain loader and the loader driven by a
    ``StreamPackingSampler`` are expected to yield ten batches; in the
    sampler case each batch must hold ``sampler.max_num_nodes + 1`` nodes.
    """
    ds = fake_large_dataset
    batch_size = 10
    padded_batch_size = batch_size + 1
    expected_batches = 10

    # The default value of `num_nodes` should be large enough so it's possible
    # to always pick 10 graphs and create additional padding graph.
    default_loader = FixedSizeDataLoader(ds, batch_size=padded_batch_size)
    seen = 0
    for _ in default_loader:
        seen += 1
    assert expected_batches == seen

    # DataLoader should correctly capture the number of nodes from sampler.
    sampler = StreamPackingSampler(ds, max_num_graphs=batch_size)
    sampled_loader = FixedSizeDataLoader(ds,
                                         batch_size=padded_batch_size,
                                         batch_sampler=sampler)
    # Pre-initialised so an empty loader still reaches the final assertion.
    seen = 0
    for seen, batch in enumerate(sampled_loader, start=1):
        assert batch.num_nodes == sampler.max_num_nodes + 1
    assert expected_batches == seen
def test_positional_encoding():
    """Pass a ``PositionalEncoding(64)`` module and a 3-element float tensor
    to ``op_harness``."""
    encoder = PositionalEncoding(64)
    # One-dimensional input holding three scalar values.
    x = torch.tensor([1.0, 2.0, 3.0])
    op_harness(encoder, [x])


def test_temporal_encoding():
    """Pass a ``TemporalEncoding(64)`` module and a 3-element float tensor
    to ``op_harness``."""
    encoder = TemporalEncoding(64)
    # Same input as the positional-encoding test above in this block.
    x = torch.tensor([1.0, 2.0, 3.0])
    op_harness(encoder, [x])
@pytest.mark.parametrize('dataset,expected_result',
                         [('fake_large_dataset',
                           FixedSizeOptions(
                               num_nodes=116,
                               num_edges=1015,
                               num_graphs=11,
                           )),
                          ('fake_hetero_dataset',
                           FixedSizeOptions(
                               num_nodes={
                                   "v0": 543,
                                   "v1": 523
                               },
                               num_edges={
                                   ("v0", "e0", "v0"): 4950,
                                   ("v1", "e0", "v1"): 4766,
                                   ("v0", "e0", "v1"): 4897,
                                   ("v1", "e0", "v0"): 4667,
                                   ("v0", "e1", "v1"): 4914,
                               },
                               num_graphs=11,
                           ))])
def test_fixed_size_options_from_dataloader(dataset, expected_result, request):
    """Check ``FixedSizeOptions.from_loader`` against hard-coded expectations.

    The options are derived from an unshuffled PyG ``DataLoader`` with batch
    size 10, first without and then with ``sample_limit=1000``; both calls
    must report the same ``num_nodes``, ``num_edges`` and ``num_graphs`` as
    ``expected_result``.
    """
    # `dataset` arrives as a fixture name and is resolved to the fixture value.
    dataset = request.getfixturevalue(dataset)
    batch_size = 10
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    fixed_size_options = FixedSizeOptions.from_loader(dataloader)
    assert fixed_size_options.num_nodes == expected_result.num_nodes
    assert fixed_size_options.num_edges == expected_result.num_edges
    assert fixed_size_options.num_graphs == expected_result.num_graphs

    # A fresh loader is built for the second measurement.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    # With sample limit
    fixed_size_options = FixedSizeOptions.from_loader(dataloader,
                                                      sample_limit=1000)
    assert fixed_size_options.num_nodes == expected_result.num_nodes
    assert fixed_size_options.num_edges == expected_result.num_edges
    assert fixed_size_options.num_graphs == expected_result.num_graphs
def test_fixed_size_options_to_hetero(request):
    """Check that ``to_hetero`` spreads scalar limits over every type.

    After the conversion each entry of ``num_nodes`` / ``num_edges`` must
    equal the scalar that was originally supplied, and ``num_graphs`` must be
    left as given.
    """
    dataset = request.getfixturevalue("fake_hetero_dataset")
    expected_graphs = 10
    expected_nodes = 20
    expected_edges = 40

    options = FixedSizeOptions(num_nodes=expected_nodes,
                               num_edges=expected_edges,
                               num_graphs=expected_graphs)
    options.to_hetero(dataset[0].node_types, dataset[0].edge_types)

    for per_type_nodes in options.num_nodes.values():
        assert per_type_nodes == expected_nodes
    for per_type_edges in options.num_edges.values():
        assert per_type_edges == expected_edges
    assert options.num_graphs == expected_graphs
class TestNoOpLayerMasker:
    """Tests that a ``PreLayerMasker`` built on ``NoMasker`` leaves a
    layer's output untouched."""

    @pytest.fixture
    def layer(self):
        """Provide a callable that adds up the element sums of its inputs."""

        def layer_function(*args):
            # Built-in sum starts from 0, matching a manual accumulator.
            return sum(torch.sum(arg) for arg in args)

        return layer_function

    @pytest.mark.parametrize("masker_name", ["node", "graph", "edge"])
    def test_masker_does_not_change_the_layer_result(
            self,
            masker_name: str,
            entries: masker.Entries,
            layer: masker.Layer,
    ):
        """Compare the raw layer with its masked wrapper on the same input."""
        pre_layer_masker = masker.PreLayerMasker(masker=masker.NoMasker())
        wrap = getattr(pre_layer_masker, f"{masker_name}_masker")
        masked_layer = wrap(layer)

        # The fixture yields either a single tensor or a tuple of tensors;
        # normalise to a tuple so it can be splatted into the layer.
        if isinstance(entries, (tuple, list)):
            inputs = entries
        else:
            inputs = (entries, )

        reference_output = layer(*inputs)
        masked_output = masked_layer(*inputs)
        assert reference_output == masked_output, (
            "For the No-op layer masker," +
            " the result of a layer should be unchanged")
def add_train_mask(data):
    """Attach a ``train_mask`` of randomly chosen node indices to ``data``.

    The mask holds ``int(0.8 * data.num_nodes)`` distinct indices taken from
    the front of a random permutation of ``range(data.num_nodes)``.

    Args:
        data: Object exposing ``num_nodes``; it is modified in place.

    Returns:
        The same ``data`` object, now carrying ``train_mask``.
    """
    permutation = torch.randperm(data.num_nodes)
    train_count = int(0.8 * data.num_nodes)
    data.train_mask = permutation[:train_count]
    return data
Unblock when AFS-97 will be completed.") model = Model(in_channels, out_channels) model.train() optimizer = poptorch.optim.Adam(model.parameters(), lr=0.001) model = poptorch.trainingModel(model=model, options=dispatcher_options, optimizer=optimizer) output = model(arg) assert isinstance(output, type(arg)), \ "Model output must have the same type as input argument" # Check that all the keys from the input argument are also present on the # output argument. for k in arg.keys: assert k in output # Check that all the keys that were added in the model are present on the # output argument. for k in ['h', 'out', 'loss']: assert k in output if isinstance(arg, Batch): # Check that the batch vector is preserved but omit the dtype since # the PopTorch dispatcher will coerce long -> int32 assert_equal(output.batch, arg.batch, check_dtype=False) assert output.batch.dtype == torch.int32 ================================================ FILE: tests/gnn/test_neighbor_loader.py ================================================ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import numpy as np import pytest import torch from torch_geometric.data import Data, HeteroData from torch_geometric.loader import NeighborLoader from torch_geometric.sampler.base import SubgraphType from torch_geometric.testing import ( get_random_edge_index, onlyNeighborSampler, ) from torch_geometric.utils import ( is_undirected, ) from poptorch_geometric.fixed_size_options import FixedSizeOptions from poptorch_geometric.neighbor_loader import FixedSizeNeighborLoader def validate_fixed_data_format(loader: FixedSizeNeighborLoader, fixed_size_options: FixedSizeOptions, is_hetero_data: bool, debug_print: bool = False): for index in range(0, len(loader), loader.batch_size): # pylint: disable=too-many-nested-blocks indices = list(range(index, index + loader.batch_size)) dynamic = loader.nativeCollate([indices]) fixed = loader.fixedSizeCollate(dynamic) if is_hetero_data: dynamic_dict_store = { '_node_store_dict': dynamic.__dict__['_node_store_dict'], '_edge_store_dict': dynamic.__dict__['_edge_store_dict'] } fixed_dict_store = { '_node_store_dict': fixed.__dict__['_node_store_dict'], '_edge_store_dict': fixed.__dict__['_edge_store_dict'] } else: dynamic_dict_store = { '_store': { "Data:": dynamic.__dict__['_store'].__dict__['_mapping'] } } fixed_dict_store = { '_store': { "Data:": fixed.__dict__['_store'].__dict__['_mapping'] } } for storage_type in dynamic_dict_store: if debug_print: print(f"Store [{storage_type}]") if storage_type == '_edge_store_dict': pad_value = fixed_size_options.edge_pad_value else: pad_value = fixed_size_options.node_pad_value dynamic_dict_group = dynamic_dict_store[storage_type] fixed_dict_group = fixed_dict_store[storage_type] for group in dynamic_dict_group: if debug_print: print(f"Group [{group}]") dynamic_dict = dynamic_dict_group[group] fixed_dict = fixed_dict_group[group] # check if values are padded as expected for key in dynamic_dict: # Batch size is used only for sampling if key == 'batch_size': continue dynamic_tensor = 
def is_subset(subedge_index, edge_index, src_idx, dst_idx):
    """Return whether every edge of a sub-graph also exists in the full graph.

    Each ``(source, destination)`` pair is folded into one integer key
    ``stride * source + destination`` where ``stride`` is the largest value in
    ``edge_index`` plus one. Sub-graph endpoints are first translated through
    ``src_idx`` / ``dst_idx`` before being encoded the same way.

    Args:
        subedge_index: ``[2, E_sub]`` index tensor whose rows index into
            ``src_idx`` and ``dst_idx`` respectively.
        edge_index: ``[2, E]`` index tensor of the reference graph.
        src_idx: Lookup tensor applied to the first row of ``subedge_index``.
        dst_idx: Lookup tensor applied to the second row of ``subedge_index``.

    Returns:
        ``True`` when all encoded sub-graph edges occur among the encoded
        reference edges, ``False`` otherwise.
    """
    stride = int(edge_index.max()) + 1
    reference_keys = stride * edge_index[0] + edge_index[1]

    sub_sources = src_idx[subedge_index[0]]
    sub_targets = dst_idx[subedge_index[1]]
    sub_keys = stride * sub_sources + sub_targets

    found = torch.from_numpy(np.isin(sub_keys, reference_keys))
    return bool(found.all())
fixed_size_options=fixed_size_options, ) validate_fixed_data_format(loader=loader, fixed_size_options=fixed_size_options, is_hetero_data=False) assert len(loader) == len(data.x) // use_batch_size batch = next(iter(loader)) assert isinstance(batch, Data) assert batch.n_id[:1].tolist() == [0] for i, batch in enumerate(loader): assert isinstance(batch, Data) assert batch.x.size(0) <= 101 assert batch.n_id.size() == (batch.num_nodes, ) assert batch.x.min() >= 0 and batch.x.max() < 101 assert batch.edge_index.min() >= 0 assert batch.edge_index.max() < batch.num_nodes # Input nodes are always sampled first: assert torch.equal( batch.x[:use_batch_size], torch.arange(i * use_batch_size, (i + 1) * use_batch_size)) if subgraph_type != SubgraphType.bidirectional: assert batch.edge_attr.min() >= 0 assert batch.edge_attr.max() < 500 assert is_subset( batch.edge_index.to(torch.int64), data.edge_index.to(torch.int64), batch.x, batch.x, ) @onlyNeighborSampler @pytest.mark.parametrize('subgraph_type', list(SubgraphType)) def test_hetero_neighbor_loader_basic(subgraph_type): dtype = torch.int64 torch.manual_seed(12345) data = HeteroData() data['paper'].x = torch.arange(15) data['author'].x = torch.arange(15, 45) edge_index = get_random_edge_index(15, 15, 45, dtype) data['paper', 'paper'].edge_index = edge_index data['paper', 'paper'].edge_attr = torch.arange(45) edge_index = get_random_edge_index(15, 30, 90, dtype) data['paper', 'author'].edge_index = edge_index data['paper', 'author'].edge_attr = torch.arange(45, 135) edge_index = get_random_edge_index(30, 15, 150, dtype) data['author', 'paper'].edge_index = edge_index data['author', 'paper'].edge_attr = torch.arange(200, 250) batch_size = 2 with pytest.raises(ValueError, match="hops must be the same across all"): default_loader = NeighborLoader( data, num_neighbors={ ('paper', 'to', 'paper'): [-1], ('paper', 'to', 'author'): [-1, -1], ('author', 'to', 'paper'): [-1, -1], }, input_nodes='paper', batch_size=batch_size, 
subgraph_type=subgraph_type, ) fixed_size_options = FixedSizeOptions.from_loader(default_loader) loader = FixedSizeNeighborLoader( data, num_neighbors={ ('paper', 'to', 'paper'): [-1], ('paper', 'to', 'author'): [-1, -1], ('author', 'to', 'paper'): [-1, -1], }, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, fixed_size_options=fixed_size_options, ) next(iter(loader)) default_loader = NeighborLoader( data, num_neighbors=[10] * 2, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, ) fixed_size_options = FixedSizeOptions.from_loader(default_loader) loader = FixedSizeNeighborLoader(data, num_neighbors=[10] * 2, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, fixed_size_options=fixed_size_options) assert len(loader) > 0 validate_fixed_data_format(loader=loader, fixed_size_options=fixed_size_options, is_hetero_data=True) @onlyNeighborSampler @pytest.mark.parametrize('subgraph_type', list(SubgraphType)) def test_hetero_neighbor_loader_large(subgraph_type): dtype = torch.int64 torch.manual_seed(12345) data = HeteroData() data['paper'].x = torch.arange(20) data['author'].x = torch.arange(20, 220) edge_index = get_random_edge_index(20, 20, 40, dtype) data['paper', 'paper'].edge_index = edge_index data['paper', 'paper'].edge_attr = torch.arange(40) edge_index = get_random_edge_index(20, 50, 250, dtype) data['paper', 'author'].edge_index = edge_index data['paper', 'author'].edge_attr = torch.arange(40, 300) edge_index = get_random_edge_index(50, 20, 250, dtype) data['author', 'paper'].edge_index = edge_index data['author', 'paper'].edge_attr = torch.arange(300, 400) batch_size = 2 with pytest.raises(ValueError, match="hops must be the same across all"): default_loader = NeighborLoader( data, num_neighbors={ ('paper', 'to', 'paper'): [-1], ('paper', 'to', 'author'): [-1, -1], ('author', 'to', 'paper'): [-1, -1], }, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, ) fixed_size_options 
= FixedSizeOptions.from_loader(default_loader) loader = FixedSizeNeighborLoader( data, num_neighbors={ ('paper', 'to', 'paper'): [-1], ('paper', 'to', 'author'): [-1, -1], ('author', 'to', 'paper'): [-1, -1], }, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, fixed_size_options=fixed_size_options, ) next(iter(loader)) default_loader = NeighborLoader( data, num_neighbors=[10] * 2, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, ) fixed_size_options = FixedSizeOptions.from_loader(default_loader) loader = FixedSizeNeighborLoader(data, num_neighbors=[10] * 2, input_nodes='paper', batch_size=batch_size, subgraph_type=subgraph_type, add_pad_masks=True, fixed_size_options=fixed_size_options) assert len(loader) > 0 validate_fixed_data_format(loader=loader, fixed_size_options=fixed_size_options, is_hetero_data=True) for batch in loader: assert isinstance(batch, HeteroData) # Test node type selection: assert set(batch.node_types) == {'paper', 'author'} assert batch['paper'].n_id.size() == (batch['paper'].num_nodes, ) assert batch['paper'].x.size(0) <= 20 + 1 assert batch['paper'].x.min() >= 0 and batch['paper'].x.max() < 40 + 1 assert batch['author'].n_id.size() == (batch['author'].num_nodes, ) assert batch['author'].x.size(0) <= 50 assert batch['author'].x.max() < 220 # Test edge type selection: assert set(batch.edge_types) == {('paper', 'to', 'paper'), ('paper', 'to', 'author'), ('author', 'to', 'paper')} row, col = batch['paper', 'paper'].edge_index assert row.min() >= 0 and row.max() < batch['paper'].num_nodes assert col.min() >= 0 and col.max() < batch['paper'].num_nodes if subgraph_type != SubgraphType.bidirectional: assert batch['paper', 'paper'].e_id.size() == (row.numel(), ) value = batch['paper', 'paper'].edge_attr assert value.min() >= 0 and value.max() < 40 assert is_subset( batch['paper', 'paper'].edge_index.to( torch.int64)[:, batch['paper', 'paper'].edges_mask], data['paper', 
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network ending in ``log_softmax``.

    Both ``GCNConv`` layers are created with ``add_self_loops=False`` and are
    joined by a 16-channel hidden representation.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        hidden_channels = 16
        self.conv1 = GCNConv(in_channels,
                             hidden_channels,
                             add_self_loops=False)
        self.conv2 = GCNConv(hidden_channels,
                             out_channels,
                             add_self_loops=False)

    def forward(self, data):
        """Compute per-node log-probabilities from ``data.x`` and
        ``data.edge_index``."""
        edge_index = data.edge_index

        hidden = self.conv1(data.x, edge_index).relu()
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        activated = self.conv2(hidden, edge_index).relu()
        return F.log_softmax(activated, dim=1)
def test_stream_packing_sampler_should_throw_exception():
    """Iterating must raise when limits are too small and skipping is off.

    Two samplers are checked: one limited to two nodes per batch and one
    limited to two edges per batch. With ``allow_skip_data=False`` consuming
    either sampler is expected to raise ``RuntimeError``.
    """
    dataset = FakeDataset(num_graphs=3, avg_num_nodes=30, avg_degree=5)

    node_limited = StreamPackingSampler(dataset,
                                        max_num_graphs=2,
                                        max_num_nodes=2,
                                        allow_skip_data=False)
    with pytest.raises(RuntimeError):
        collected = []
        for sample in node_limited:
            collected.append(sample)

    edge_limited = StreamPackingSampler(dataset,
                                        max_num_graphs=2,
                                        max_num_edges=2,
                                        allow_skip_data=False)
    with pytest.raises(RuntimeError):
        collected = []
        for sample in edge_limited:
            collected.append(sample)
@pytest.mark.parametrize('shuffle', [True, False]) @pytest.mark.parametrize('batch_num_graphs', [2, 10]) @pytest.mark.parametrize('allow_skip_data', [True, False]) def test_stream_packing_should_return_valid_samples(data_type, shuffle, batch_num_graphs, allow_skip_data, fake_hetero_dataset): if is_data(data_type): avg_num_nodes = 30 dataset = FakeDataset(num_graphs=100, avg_num_nodes=avg_num_nodes, avg_degree=5, num_channels=16, edge_dim=8) else: avg_num_nodes = 50 dataset = fake_hetero_dataset avg_num_edges = int(math.ceil(mean((data.num_edges for data in dataset)))) batch_num_nodes = avg_num_nodes * batch_num_graphs + batch_num_graphs if not allow_skip_data: max_num_nodes = max(data.num_nodes for data in dataset) batch_num_nodes = max(batch_num_nodes, max_num_nodes + batch_num_graphs) batch_num_edges = avg_num_edges * batch_num_graphs + batch_num_graphs if not allow_skip_data: max_num_edges = max(data.num_edges for data in dataset) batch_num_edges = max(batch_num_edges, max_num_edges + batch_num_graphs) base_sampler = RandomSampler(dataset) if shuffle else \ SequentialSampler(dataset) # Leave space for padding. 
sampler = StreamPackingSampler(dataset, max_num_graphs=batch_num_graphs - 1, max_num_nodes=batch_num_nodes - 1, max_num_edges=batch_num_edges - 1, base_sampler=base_sampler, allow_skip_data=allow_skip_data) length = sum(1 for _ in sampler) assert length > 0 if not allow_skip_data: for sample in sampler: assert len(sample) <= batch_num_graphs @pytest.mark.parametrize('data_type', [Data, HeteroData]) @pytest.mark.parametrize('shuffle', [True, False]) @pytest.mark.parametrize('allow_skip_data', [True, False]) @pytest.mark.parametrize('torch_data_loader', [True, False]) def test_stream_packing_sampler_should_be_usable_with_torch_data_loader( data_type, shuffle, allow_skip_data, torch_data_loader, fake_hetero_dataset): batch_num_graphs = 10 num_channels = 16 edge_dim = 8 num_graphs = 10 if is_data(data_type): avg_num_nodes = 30 dataset = FakeDataset(num_graphs=100, avg_num_nodes=avg_num_nodes, avg_degree=5, num_channels=num_channels, edge_dim=8) else: avg_num_nodes = 50 dataset = fake_hetero_dataset avg_num_edges = math.ceil(mean(data.num_edges for data in dataset)) base_sampler = RandomSampler(dataset) if shuffle else \ SequentialSampler(dataset) batch_num_nodes = avg_num_nodes * batch_num_graphs + batch_num_graphs if not allow_skip_data: max_num_nodes = max(data.num_nodes for data in dataset) batch_num_nodes = max(batch_num_nodes, max_num_nodes + batch_num_graphs) batch_num_edges = avg_num_edges * batch_num_graphs + batch_num_graphs if not allow_skip_data: max_num_edges = max(data.num_edges for data in dataset) batch_num_edges = max(batch_num_edges, max_num_edges + batch_num_graphs) fixed_size_options = FixedSizeOptions(num_nodes=batch_num_nodes, num_edges=batch_num_edges, num_graphs=num_graphs, node_pad_value=0.0, edge_pad_value=0.0, graph_pad_value=0.0) # Leave space for padding. 
if torch_data_loader: batch_sampler = StreamPackingSampler(dataset, max_num_graphs=num_graphs - 1, max_num_nodes=batch_num_nodes - 1, max_num_edges=batch_num_edges - 1, base_sampler=base_sampler, allow_skip_data=allow_skip_data) collater = CombinedBatchingCollater( FixedSizeCollater(fixed_size_options=fixed_size_options, add_masks_to_batch=True)) dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collater) else: dataloader = FixedSizeDataLoader( dataset, batch_size=num_graphs, fixed_size_options=fixed_size_options, fixed_size_strategy=FixedSizeStrategy.StreamPack, add_pad_masks=True, over_size_strategy=OverSizeStrategy.Skip if allow_skip_data else OverSizeStrategy.Error) expected_x_shape = torch.Size([batch_num_nodes, num_channels]) expected_batch_shape = torch.Size([batch_num_nodes]) expected_edge_attr_shape = torch.Size([batch_num_edges, edge_dim]) expected_mask_attr_shape = torch.Size([batch_num_graphs]) expected_edge_index_attr_shape = torch.Size([2, batch_num_edges]) for data in dataloader: assert data.graphs_mask.shape == expected_mask_attr_shape if is_data(data_type): assert data.x.shape == expected_x_shape assert data.batch.shape == expected_batch_shape assert data.edge_attr.shape == expected_edge_attr_shape assert data.edge_index.shape == expected_edge_index_attr_shape else: num_node_types = 2 num_edge_types = 5 assert data.num_nodes == batch_num_nodes * num_node_types assert data.num_edges == batch_num_edges * num_edge_types @pytest.mark.parametrize('shuffle', [True, False]) @pytest.mark.parametrize('allow_skip_data', [True, False]) def test_stream_packing_sampler_padding_not_needed(shuffle, allow_skip_data): num_graphs_in_dataset = 100 num_nodes = 30 batch_num_graphs = 10 num_channels = 16 edge_dim = 8 dataset = FakeDatasetEqualGraphs(num_graphs=num_graphs_in_dataset, num_nodes=num_nodes, num_channels=num_channels, edge_dim=edge_dim) avg_num_edges = math.ceil(mean(data.num_edges for data in dataset)) base_sampler = 
RandomSampler(dataset) if shuffle else \ SequentialSampler(dataset) batch_num_nodes = num_nodes * batch_num_graphs if not allow_skip_data: max_num_nodes = max(data.num_nodes for data in dataset) batch_num_nodes = max(batch_num_nodes, max_num_nodes + batch_num_graphs) batch_num_edges = avg_num_edges * batch_num_graphs if not allow_skip_data: max_num_edges = max(data.num_edges for data in dataset) batch_num_edges = max(batch_num_edges, max_num_edges + batch_num_graphs) batch_sampler = StreamPackingSampler(dataset, max_num_graphs=batch_num_graphs, max_num_nodes=batch_num_nodes, max_num_edges=batch_num_edges, base_sampler=base_sampler, allow_skip_data=allow_skip_data) fixed_size_options = FixedSizeOptions(num_nodes=batch_num_nodes, num_edges=batch_num_edges, num_graphs=batch_num_graphs, node_pad_value=0.0, edge_pad_value=0.0, graph_pad_value=0.0) collator = CombinedBatchingCollater( FixedSizeCollater(fixed_size_options=fixed_size_options, add_masks_to_batch=True)) dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collator) expected_x_shape = torch.Size([batch_num_nodes, num_channels]) expected_batch_shape = torch.Size([batch_num_nodes]) expected_edge_attr_shape = torch.Size([batch_num_edges, edge_dim]) expected_mask_attr_shape = torch.Size([batch_num_graphs]) expected_edge_index_attr_shape = torch.Size([2, batch_num_edges]) total_graphs_from_dataloader = 0 for data in dataloader: assert data.x.shape == expected_x_shape assert data.batch.shape == expected_batch_shape assert data.edge_attr.shape == expected_edge_attr_shape assert data.graphs_mask.shape == expected_mask_attr_shape assert data.edge_index.shape == expected_edge_index_attr_shape total_graphs_from_dataloader += data.num_graphs assert total_graphs_from_dataloader == num_graphs_in_dataset ================================================ FILE: tests/gnn/utils.py ================================================ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
def is_data(type_: BaseData) -> bool:
    """Tell whether `type_` is the homogeneous `Data` class.

    Args:
        type_: The data class under test; compared by identity against
            `Data` and `HeteroData` (the class itself, not an instance).

    Returns:
        True if `type_` is `Data`, False if it is `HeteroData`.

    Raises:
        TypeError: If `type_` is neither `Data` nor `HeteroData`.
    """
    if type_ is Data:
        return True
    if type_ is HeteroData:
        return False
    # Raising a bare string is itself a TypeError ("exceptions must derive
    # from BaseException") that hides the intended message, so raise a real
    # TypeError carrying that message instead.
    raise TypeError(
        f"Wrong data type: {type_}. Should be Data or HeteroData!")
def run_notebook(notebook_filename, expected_error: str = "", cwd=REPO_ROOT):
    """helper to run notebooks which may or may not be expected to fail

    Args:
        notebook_filename: Path of the notebook file to read and execute.
        expected_error: Text expected in the error message. When non-empty
            and found in the message of an exception raised during
            execution, that exception is converted to `ExpectedError`.
        cwd: Passed (as a string) as the `path` entry of the resources
            metadata given to the preprocessor.

    Raises:
        ExpectedError: If execution fails with a message containing
            `expected_error`.
        Exception: Any other execution failure is re-raised unchanged.
    """
    # Notebooks are JSON documents stored as UTF-8: do not depend on the
    # locale's default encoding when reading them.
    with open(notebook_filename, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    try:
        ep.preprocess(nb, {"metadata": {"path": f"{cwd}"}})
    except Exception as e:
        # Only failures matching the expected message are converted.
        if (not expected_error) or (expected_error not in str(e)):
            raise
        raise ExpectedError(expected_error) from e
num_edges (int, optional): The number of edges in a graph. (default: :obj:`None`) """ def __init__(self, num_graphs: int, num_nodes: int, num_channels: int, edge_dim: int, num_edges: Optional[int] = None) -> None: if num_graphs < 1: raise RuntimeError("Can't create dataset with less than 1 graph.") super().__init__('.') self.num_nodes = num_nodes if num_edges is not None: self.num_edges = num_edges else: # Randomize number of edges in graph. self.num_edges = random.randint(num_nodes + 1, num_nodes * (num_nodes - 1)) self.num_channels = num_channels self.edge_dim = edge_dim data_list = [self.generate_data() for _ in range(num_graphs)] self.data, self.slices = self.collate(data_list) def generate_data(self) -> Data: x = torch.rand(self.num_nodes, self.num_channels) edge_index = torch.randint(high=self.num_nodes, size=(2, self.num_edges)) edge_attr = torch.rand(self.num_edges, self.edge_dim) # -100 is the default value of `ignore_index` in `nn.CrossEntropyLoss`. y = torch.tensor([-100]).long() return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y) ================================================ FILE: tests/grouping_scatters_gathers_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
def check_is_fused(poptorch_model, op_type, expected_group_size,
                   expected_num_ops):
    """Assert on the ops of the expected type found in the model's PopART IR.

    The IR is obtained as JSON from `poptorch_model._debugGetPopartIR()` and
    its 'maingraph' list is searched for "GroupedGather" and "Gather" ops when
    `op_type` is "gather", and for "ScatterReduce" ops otherwise. The number
    of matches must equal `expected_num_ops`, and the 'group_size' attribute
    of the first match must equal `expected_group_size`.
    """
    ir = json.loads(poptorch_model._debugGetPopartIR())  # pylint: disable=protected-access
    main_graph = ir['maingraph']

    if op_type == "gather":
        wanted_types = ("GroupedGather", "Gather")
    else:
        wanted_types = ("ScatterReduce", )

    # Matches are grouped by wanted type, in the order listed above.
    matched = [
        op for wanted in wanted_types for op in main_graph
        if op['type'] == wanted
    ]

    assert len(matched) == expected_num_ops
    first_group_size = int(matched[0]['attributes']['group_size'])
    assert first_group_size == expected_group_size
should not be fused out_updated_s, _ = scatter_max(src_updated, index, dim_size=dim_size) out_updated_g = gather(src_updated, dim, index) out_updated_sum = torch.sum(out_updated_g) + torch.sum( out_updated_s) return (out_ones + out_two) / out_updated_sum model = Model() options = poptorch.Options() poptorch_model = poptorch.inferenceModel(model, options=options) ones = torch.ones_like(src, dtype=dtype) two = torch.ones_like(src) * 2 if func == gather: native_out = func(src, dim, index) native_out_ones = func(ones, dim, index) native_out_two = func(two, dim, index) else: native_out = func(src, index, dim_size=dim_size) native_out_ones = func(ones, index, dim_size=dim_size) native_out_two = func(two, index, dim_size=dim_size) if isinstance(native_out, tuple): native_out = native_out[0] native_out_ones = native_out_ones[0] native_out_two = native_out_two[0] src_updated = src - torch.sum(native_out) native_out_updated_s, _ = scatter_max(src_updated, index) native_out_updated_g = gather(src_updated, dim, index) native_out_updated_sum = torch.sum(native_out_updated_s) + torch.sum( native_out_updated_g) expected_nat = (native_out_ones + native_out_two) / native_out_updated_sum ipu_out = poptorch_model(src, index, dtype) # Verify that the ops have been fused expected_num_ops = expected_ops_after_fuse[func.__name__] expected_group_size = expected_group_size_after_fuse[func.__name__] if dtype != torch.float32: expected_group_size = expected_group_size - 1 expected_num_ops = expected_num_ops + 1 check_is_fused(poptorch_model, func.__name__, expected_group_size, expected_num_ops) helpers.assert_allclose(actual=torch.nan_to_num(ipu_out), expected=torch.nan_to_num(expected_nat)) @pytest.mark.parametrize("shape", [(3, ), (3, 5), (3, 5, 5)]) @pytest.mark.parametrize("func", [ scatter, scatter_add, scatter_max, scatter_softmax, scatter_log_softmax, scatter_std, gather ]) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int]) def test_fuse(shape, func, dtype): 
@pytest.mark.parametrize("bias", [True, False])
@pytest.mark.parametrize("batch_first", [True, False])
def test_gru(bias, batch_first):
    """Compare a single-layer GRU run natively and through PopTorch training.

    Both the output and the final hidden state must match, and the weights of
    the wrapper's linear layer must have changed after the training step.
    """
    length = 1
    batches = 3
    input_size = 5
    hidden_size = 7
    layers = 1
    directions = 1

    torch.manual_seed(42)
    # Only the order of the first two input dimensions depends on batch_first.
    leading_dims = (batches, length) if batch_first else (length, batches)
    inp = torch.randn(*leading_dims, input_size)
    h0 = torch.randn(layers * directions, batches, hidden_size)

    gru = torch.nn.GRU(input_size,
                       hidden_size,
                       bias=bias,
                       batch_first=batch_first)

    def first_output(outputs):
        # The loss is computed from the first element of the GRU result.
        return outputs[0]

    model = helpers.ModelWithWeights(gru, inp.shape, first_output)
    poptorch_model = poptorch.trainingModel(model)

    native_result, _ = model((inp, h0))
    native_out, native_hn = native_result
    poptorch_result, _ = poptorch_model((inp, h0))
    poptorch_out, poptorch_hn = poptorch_result

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)
    helpers.assert_allclose(actual=poptorch_hn, expected=native_hn)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()
# The dtype will correctly resolve because it matches the input added.
# All settings will match PyTorch.
@pytest.mark.parametrize("op", ones_zeros)
def test_ones_zeros_input_resolved_with_input_dtype(op):
    """Add ones/zeros created with the input's dtype to the input itself."""

    def add_filled(tensor):
        filled = op((2, 3, 4),
                    dtype=tensor.dtype,
                    device=helpers.outputDevice())
        return filled + tensor

    # Run the same check for a half and for a float input.
    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), add_filled)
# The dtype will correctly resolve because it matches the input added.
def test_rand_default_input_resolved():
    """Add a torch.rand tensor created with the input's dtype to the input."""

    def add_random(tensor):
        noise = torch.rand(3, 5, 100, dtype=tensor.dtype)
        return noise + tensor

    # Run the same check for a half and for a float input.
    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([1], dtype=dtype), add_random)
def test_rand_default_input_resolved_always_float16(): def fw_op(input): return torch.rand(3, 5, 100, dtype=torch.float16) + input type_out_harness(torch.tensor([1], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([1], dtype=torch.float32), fw_op) ## torch.normal tests ## # The type will be resolved correctly as the mean and standard deviation are # inputs to the op def test_normal_mean_correctly_resolved(): def fw_op(input_mean): return torch.normal(input_mean, 10.0) type_out_harness(torch.tensor([0.0], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([0.0], dtype=torch.float32), fw_op) # The type will be resolved correctly as the mean and standard deviation are # inputs to the op def test_normal_std_correctly_resolved(): def fw_op(input_std): return torch.normal(0.0, input_std) type_out_harness(torch.tensor([10.0], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([10.0], dtype=torch.float32), fw_op) ## torch.distributions.uniform.Uniform tests ## # The type will always resolve to float32 as it is traced to torch.rand without # the low and high input tensors (which become dead code) def test_distributions_uniform(): def fw_op(input_low): torch.manual_seed(42) ud = torch.distributions.uniform.Uniform( input_low, torch.tensor([10.0], dtype=torch.float32)) return ud.sample((10, 10, 1000)) type_out_harness(torch.tensor([1], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([1], dtype=torch.float32), fw_op) ## torch.distributions.Normal tests ## # The type will resolve correctly because the mean is an input def test_distributions_normal_mean_correctly_resolved(): def fw_op(input_mean): torch.manual_seed(42) ud = torch.distributions.Normal(input_mean, 10.0) return ud.sample((10, 10, 100)) type_out_harness(torch.tensor([0.0], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([0.0], dtype=torch.float32), fw_op) def test_distributions_normal_std_correctly_resolved(): def fw_op(input_std): torch.manual_seed(42) ud = 
## tensor constant tests ##


# The type will resolve correctly because the constant is added to the input.
#
# The output will always be the same type as the input.
def test_constant_correctly_resolved():
    """Add a constant tensor created with the input's dtype to the input."""

    def add_constant(tensor):
        constant = torch.tensor([1, 2, 3], dtype=tensor.dtype)
        return constant + tensor

    # Run the same check for a half and for a float input.
    for dtype in (torch.float16, torch.float32):
        type_out_harness(torch.tensor([3, 4, 8], dtype=dtype), add_constant)
def test_constant_add_float16(): def fw_op(input): return torch.tensor([1, 2, 3], dtype=input.dtype) + input.to( torch.float16) type_out_harness(torch.tensor([3, 4, 8], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([3, 4, 8], dtype=torch.float32), fw_op) def test_constant_always_float32(): def fw_op(input): return torch.tensor([1, 2, 3], dtype=torch.float32) + input type_out_harness(torch.tensor([3, 4, 8], dtype=torch.float16), fw_op) type_out_harness(torch.tensor([3, 4, 8], dtype=torch.float32), fw_op) @pytest.mark.parametrize("conv", [True, False]) def test_float16_activations_float32_weights(conv): torch.manual_seed(42) if conv: input = torch.ones(1, 4, 4) model = torch.nn.Conv1d(4, 5, 2) else: input = torch.ones(10) model = torch.nn.Linear(10, 20) # Float 32 act, float 32 weights pop_model = poptorch.inferenceModel(model) pop_out = pop_model(input) assert pop_out.dtype == torch.float # Float 16 act, float 32 weights pop_model = poptorch.inferenceModel(model) pop_out = pop_model(input.half()) assert pop_out.dtype == torch.half # Float 32 act, float 16 weights model.half() pop_model = poptorch.inferenceModel(model) pop_out = pop_model(input) assert pop_out.dtype == torch.float # Float 16 act, float 16 weights pop_model = poptorch.inferenceModel(model) pop_out = pop_model(input.half()) assert pop_out.dtype == torch.half def test_master_weight_training(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.MSELoss() def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = Model() poptorch_model = poptorch.trainingModel(model) target = torch.randn(10) input = torch.randn(10).half() # Make sure the first run doesn't already pass the test.s original, original_loss = poptorch_model(input, target.half()) assert original_loss > 0.1 assert not torch.allclose(original.float(), target, rtol=1e-02, atol=1e-02) 
def test_bigger_model_training():
    """Train a chain of five linear layers on a half input until it fits.

    The first step must not already satisfy the final checks; after 2500
    further steps the loss must be below 0.001 and the output close to the
    target.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            layers = [torch.nn.Linear(10, 10) for _ in range(5)]
            self.linear_chain = torch.nn.Sequential(*layers)
            self.loss = torch.nn.MSELoss()

        def forward(self, data, target):
            out = self.linear_chain(data)
            return out, self.loss(out, target)

    poptorch_model = poptorch.trainingModel(Model())

    target = torch.randn(10)
    half_input = torch.randn(10).half()

    # Make sure the first run doesn't already pass the tests.
    first_out, first_loss = poptorch_model(half_input, target.half())
    assert first_loss > 0.1
    assert not torch.allclose(first_out.float(),
                              target,
                              rtol=1e-02,
                              atol=1e-02)

    for _ in range(0, 2500):
        out, loss = poptorch_model(half_input, target.half())

    # Check we have trained the "model"
    assert loss.float() < 0.001
    helpers.assert_allclose(actual=out.float(),
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)
import os # pylint: disable=unused-import import unittest.mock import torch import torchvision.models as models import helpers import poptorch def test_half_float_default_option(): class SimpleAdder(torch.nn.Module): def forward(self, x, y): return x + y model = SimpleAdder() inference_model = poptorch.inferenceModel(model) t1 = torch.tensor([1.]).half() t2 = torch.tensor([2.]).float() outHalf = inference_model(t1, t2) assert outHalf.dtype == torch.float # Refresh and try the other way model = SimpleAdder() inference_model = poptorch.inferenceModel(model) outHalf = inference_model(t2, t1) assert outHalf.dtype == torch.float @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_resnet(): torch.manual_seed(42) image_input = torch.randn([1, 3, 224, 224]).half() t1 = torch.tensor([1.]).long() loss_fn = torch.nn.NLLLoss() class ModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() # We are running on a dummy input so it doesn't matter whether the # weights are trained. self.base_model = models.resnet18(pretrained=False) def forward(self, data, target): out = self.base_model(data) loss = loss_fn(out, target) return out, loss model = ModelWithLoss() model.train() model.half() training_model = poptorch.trainingModel(model) # Run on IPU. poptorch_out, loss = training_model(image_input, t1) assert poptorch_out.dtype == torch.half assert loss.dtype == torch.half def test_model_with_weights(): model = torch.nn.Linear(1, 10).half() t1 = torch.tensor([1.]).half() inference_model = poptorch.inferenceModel(model) out = inference_model(t1) assert out.dtype == torch.half # For running on host. 
def test_lstm():
    """Run a half-precision LSTM through PopTorch and check the output type.

    The LSTM is converted to half, wrapped in an inference model and fed a
    sequence of five half inputs together with half hidden/cell states; the
    first element of the result must be a `torch.HalfTensor`.
    """
    torch.manual_seed(42)
    numHidden = 5
    inputSize = 3

    # Use the named size (rather than a repeated literal) so the layer always
    # agrees with the inputs generated below.
    lstm = torch.nn.LSTM(inputSize, numHidden)
    lstm.half()
    ipuLstm = poptorch.inferenceModel(lstm)

    inputs = [torch.randn(1, inputSize).half() for _ in range(5)]
    # Add the extra 2nd dimension
    inputs = torch.cat(inputs).view(len(inputs), 1, -1)
    hidden = (
        torch.randn(1, 1, numHidden).half(),
        torch.randn(1, 1, numHidden).half(),
    )
    ipuOut = ipuLstm(inputs, hidden)
    assert isinstance(ipuOut[0], torch.HalfTensor)
_, ipu_var, ipu_mean = poptorch_model(fake_data) # We lose some precision in the half conversion. helpers.assert_allclose(actual=ipu_mean, expected=cpu_mean.half(), rtol=1e-02, atol=1e-02) helpers.assert_allclose(actual=ipu_var, expected=cpu_var.half(), rtol=1e-02, atol=1e-02) def test_half_casts_outplace(): torch.manual_seed(42) opts = poptorch.Options() class Model(torch.nn.Module): def forward(self, x1, x2): return x1, x2, x1.to(torch.float16), x2.half() model = Model() poptorch_model = poptorch.inferenceModel(model, opts) x1 = torch.tensor([0], dtype=torch.float32) x2 = torch.tensor([0], dtype=torch.float32) x1_ipu, x2_ipu, x1_cast, x2_cast = poptorch_model(x1, x2) assert x1_ipu.dtype == torch.float32 assert x2_ipu.dtype == torch.float32 assert x1_cast.dtype == torch.float16 assert x2_cast.dtype == torch.float16 def test_8bit_io_casting(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(1, 1) def forward(self, x): x1 = self.linear(x.half()) x2 = self.linear(x.to(torch.half)) x3 = self.linear(x.float()) x4 = self.linear(x.to(torch.float)) return x1, x2, x3, x4 model = Model() poptorch_model = poptorch.inferenceModel(model) x = torch.tensor([0], dtype=torch.uint8) y = poptorch_model(x) assert y[0].dtype == torch.half assert y[1].dtype == torch.half assert y[2].dtype == torch.float assert y[3].dtype == torch.float def test_buffers_without_parameters_can_be_traced(): torch.manual_seed(0) class Model(torch.nn.Module): def __init__(self): super().__init__() self.register_buffer("b", torch.randn(3, 3)) def forward(self, x): return torch.matmul(self.b, x) model = Model() model.half() poptorch_model = poptorch.inferenceModel(model) poptorch_model(torch.randn(3, 3).half()) ================================================ FILE: tests/helpers.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
def assert_allclose(*,
                    actual=None,
                    expected=None,
                    check_dtype=False,
                    atol=None,
                    rtol=None,
                    **kwargs):
    """Assertion function that enforces passing the 'actual' and 'expected'
    arguments to torch.testing.assert_close in the correct order by forcing
    the use of keyword arguments. This improves error reporting in case of
    assertion failures.

    Two lists or two tuples are compared element by element (recursively)
    with the same tolerances and dtype checking as the top-level call.
    When 'expected' is float16 and a tolerance is not given, atol defaults to
    5e-4 and rtol to 5e-3.

    :param actual: torch.Tensor, scalar value, or array-like of either
        torch.Tensor objects or scalar values that is tested.
    :param expected: torch.Tensor, scalar value, or array-like of either
        torch.Tensor objects or scalar values that is tested.
    :param check_dtype: whether to check the types of the tensor
    :param atol: absolute tolerance passed to torch.testing.assert_close.
    :param rtol: relative tolerance passed to torch.testing.assert_close.
    :param kwargs: kwargs passed to torch.testing.assert_close.
    """
    assert actual is not None and expected is not None, (
        "'actual' and 'expected' keyword arguments must be present")

    in_types = (type(actual), type(expected))
    if in_types == (torch.Tensor, torch.Tensor):
        assert actual.shape == expected.shape, (
            "Shape of 'actual' (%s) should be the same as shape of"
            " 'expected' (%s)") % (actual.shape, expected.shape)
    elif in_types in ((list, list), (tuple, tuple)):
        assert len(actual) == len(expected), (
            "Length of 'actual' (%s) should be the same as length of"
            " 'expected' (%s)") % (len(actual), len(expected))
        for a, e in zip(actual, expected):
            # Forward the explicit keyword arguments too: they are not part
            # of **kwargs, so leaving them out would silently compare the
            # elements with default tolerances and no dtype check.
            assert_allclose(actual=a,
                            expected=e,
                            check_dtype=check_dtype,
                            atol=atol,
                            rtol=rtol,
                            **kwargs)
        return

    if not isinstance(actual, torch.Tensor):
        actual = torch.tensor(actual)
    if not isinstance(expected, torch.Tensor):
        expected = torch.tensor(expected)

    # Looser defaults for half precision unless the caller chose a tolerance.
    if atol is None and expected.dtype == torch.float16:
        atol = 5e-4
    if rtol is None and expected.dtype == torch.float16:
        rtol = 5e-3

    torch.testing.assert_close(actual,
                               expected,
                               atol=atol,
                               rtol=rtol,
                               check_dtype=check_dtype,
                               **kwargs)
""" assert actual is not None and expected is not None, ( "'actual' and 'expected' keyword arguments must be present") if isinstance(actual, torch.Tensor) and isinstance(expected, torch.Tensor): assert actual.shape == expected.shape, ( "Shape of 'actual' (%s) should be the same as shape of" " 'expected' (%s)") % (actual.shape, expected.shape) torch.testing.assert_close(actual, expected, rtol=0, atol=0, msg=msg, check_dtype=check_dtype, **kwargs) def disableSmallModel(): # POPTORCH_IPU_MODEL takes precedence over POPTORCH_SMALL_IPU_MODEL if not poptorch.ipuHardwareIsAvailable(): return {"POPTORCH_IPU_MODEL": "1"} return {} def forceSmallModel(): # POPTORCH_IPU_MODEL takes precedence over POPTORCH_SMALL_IPU_MODEL return {"POPTORCH_IPU_MODEL": "0", "POPTORCH_SMALL_IPU_MODEL": "1"} def disableAllModels(): return {"POPTORCH_IPU_MODEL": "0", "POPTORCH_SMALL_IPU_MODEL": "0"} def propagateInputShapes(graph, dummyInputs): for graphInput, dummyInput in zip(graph.inputs(), dummyInputs): graphInput.inferTypeFrom(dummyInput) poptorch_core.propagateInputShapes(graph) # Wrapper model with weights to test that gradients are generated # and updated in a graph with a given op - Linear layer added to # ensure some weights exist class ModelWithWeights(torch.nn.Module): def __init__(self, op, first_input_shape, out_fn=None, loss_fn=None): super().__init__() self.op = op numel = first_input_shape.numel() self.first_input_shape = first_input_shape self.lin = torch.nn.Linear(numel, numel) # Copy original weights for training test self._weights_before = self.lin.weight.detach().clone() # A function of the output that returns what the backwards pass should # propagate through. For example, torch.median returns values and indices # but the loss should only be calculated using the values. If unspecified, # defaults to an identity function self.out_fn = out_fn # If the loss fn takes more than 1 param (e.g. 
def overridePoptorchLogLevel(level=None):
    """Decorator to override the PopTorch log level for the duration of the
    test

    :param level: log level passed to poptorch.setLogLevel before the wrapped
        function runs; when None the current level is left untouched.

    After the wrapped function finishes, whether it returns or raises, the
    level is set to the value of the POPTORCH_LOG_LEVEL environment variable,
    or "WARN" if it is not set. The wrapped function's return value is
    passed through.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if level is not None:
                poptorch.setLogLevel(level)
            try:
                return func(*args, **kwargs)
            finally:
                # A failing test must not leak its log level into the tests
                # that run after it.
                poptorch.setLogLevel(
                    os.environ.get("POPTORCH_LOG_LEVEL", "WARN"))

        return wrapper

    return decorator
class LogIterator:
    """Forward-only cursor over a list of log lines.

    ``findNext`` consumes lines up to and including the first match, so
    successive calls check that messages appear in a given order.
    """

    def __init__(self, lines):
        self._lines = lines
        self._current = 0
        self._num_lines = len(lines)
        # Every set of expressions requested so far, reported on failure.
        self._all_checks = []

    def lineNumber(self):
        """Return the index of the next line the iterator will examine."""
        return self._current

    def findNext(self, *exprs):
        """Find the next line in the log matching all the regular
        expressions provided and return it.

        Raises AssertionError if no remaining line matches."""
        self._all_checks.append(exprs)
        line = self._findNext(exprs)
        assert line is not None, (
            "\n".join(self._lines) +
            "\n The log above doesn't contain lines matching all "
            "these expressions:\n " +
            "\n ".join(str(e) for e in self._all_checks))
        return line

    def _findNext(self, exprs):
        """Advance to just past the next line matching every expression.

        Returns the matching line, or None (with the cursor at the end of
        the log) if there is none."""
        while self._current < self._num_lines:
            line = self._lines[self._current]
            self._current += 1
            if all(re.search(e, line) for e in exprs):
                return line
        return None

    def assert_not_contains(self, *exprs):
        """Raise ValueError if any remaining line matches all the regular
        expressions provided.

        The cursor is left where it was when the check passes: the search
        itself runs to the end of the log, and keeping that position would
        make every later ``findNext`` fail and every later
        ``assert_not_contains`` pass without looking at anything."""
        start = self._current
        line = self._findNext(exprs)
        if line is not None:
            raise ValueError(
                f"{line}"
                "\n The line above matches all of the expressions "
                f"{exprs}")
        self._current = start

    def findAll(self, expr):
        """Return the ``re`` match object of every line in the log matching
        the provided regular expression (the whole log is searched,
        regardless of the cursor position)"""
        matches = (re.search(expr, line) for line in self._lines)
        return [match for match in matches if match is not None]


class LogChecker:
    """Assertions over a captured log.

    Accepts the log as a string, an object with a ``text`` attribute, or an
    object with a ``readouterr()`` method returning ``(out, err)``.
    """

    def __init__(self, capfd_or_str):
        if isinstance(capfd_or_str, str):
            self._log = capfd_or_str
        elif hasattr(capfd_or_str, "text"):
            self._log = capfd_or_str.text
        elif hasattr(capfd_or_str, "readouterr"):
            out, err = capfd_or_str.readouterr()
            self._log = out + err
        else:
            raise TypeError("LogChecker passed unsupported capture fixture")
        self._lines = self._log.split('\n')

    def createIterator(self):
        """Return a LogIterator positioned at the first line of the log."""
        return LogIterator(self._lines)

    def assert_isEmpty(self):
        """Assert that nothing was captured."""
        assert not self._log, f"Expected an empty log but got {self._log}"

    def assert_contains(self, *strings):
        """Assert there is a line in the log matching all the strings provided

        A single string is searched for in the whole log rather than line by
        line.
        """
        if len(strings) == 1:
            assert strings[0] in self._log, (f"{self._log}"
                                             "\ndoes not contain "
                                             f"'{strings[0]}'")
        else:
            assert any(
                all(s in line for s in strings) for line in self._lines), (
                    f"{self._log}"
                    "\n No line in the above log contains all of the strings "
                    f"{strings}")

    def assert_contains_after(self, string, after):
        """Assert there is a line in the log matching the string provided,
        at least one line after the line containing the other provided
        string"""
        after_hit = False
        for line in self._lines:
            if after_hit:
                if string in line:
                    return
            elif after in line:
                after_hit = True
        raise AssertionError(f"Did not contain {string} after {after}")

    def assert_not_contains(self, *strings):
        """Assert there is no line in the log matching all the strings provided

        A single string fails with AssertionError; several strings fail with
        ValueError.
        """
        if len(strings) == 1:
            assert strings[0] not in self._log, (f"{self._log}"
                                                 "\ncontains "
                                                 f"'{strings[0]}'")
        else:
            for line in self._lines:
                if all(s in line for s in strings):
                    # Found a line matching all the strings
                    raise ValueError(
                        f"{line}"
                        "\n The line above matches all of the strings "
                        f"{strings}")

    def _string_matches_exprs(self, s, exprs):
        """Return True if every regular expression is found in ``s``."""
        return all(re.search(e, s) for e in exprs)

    def assert_matches(self, *exprs, per_line=True):
        """Assert the log matches all the regular expressions provided

        With ``per_line`` a single line must match every expression,
        otherwise the log is searched as one string. Raises ValueError on
        failure.
        """
        if per_line:
            # Found a line matching all the exprs
            if any(
                    self._string_matches_exprs(line, exprs)
                    for line in self._lines):
                return
        else:
            # Search the entire log at once
            if self._string_matches_exprs(self._log, exprs):
                return
        any_line_in = "any line in " if per_line else ""
        raise ValueError(
            f"{self._log}"
            f"\n All of the expressions do not match {any_line_in}"
            f"the log {exprs}")

    def assert_no_matches(self, *exprs, per_line=True):
        """Assert the log does not match all the regular expressions provided

        Raises ValueError if a line (or, without ``per_line``, the whole
        log) matches every expression."""
        if per_line:
            for line in self._lines:
                if self._string_matches_exprs(line, exprs):
                    # Found a line matching all the exprs
                    raise ValueError(
                        f"{line}"
                        "\n The line above matches all of the expressions "
                        f"{exprs}")
        else:
            if self._string_matches_exprs(self._log, exprs):
                # The log matches all the exprs
                raise ValueError(
                    f"{self._log}"
                    "\n The log above matches all of the expressions "
                    f"{exprs}")

    def findall(self, pattern: str) -> list:
        """Return ``re.findall(pattern, log)`` over the whole log."""
        return re.findall(pattern, self._log)
def if_else_harness(model, expected_then, expected_else, *args):
    """Check both branches of a conditional model on the IPU and on the CPU.

    The model is called with a ``[True]`` condition and then a ``[False]``
    condition, each followed by ``*args``; both the native and the compiled
    results are compared against the expected value for that branch.
    """
    inference_model = infer_model(model)

    branches = ((True, expected_then), (False, expected_else))
    for flag, expected in branches:
        condition = torch.tensor([flag])
        # The compiled model runs first, then the native one.
        ipu_result = inference_model(condition, *args)
        cpu_result = model(condition, *args)
        assert_allclose(expected=expected, actual=cpu_result)
        assert_allclose(expected=expected, actual=ipu_result)
@pytest.mark.skip(
    reason="Returning constant from model does not work in poptorch (AFS-251)")
def test_operations_on_constants():
    """Both branches operate on tensors created inside ``forward``."""
    constants = [[1., 2.], [3., 4.]]
    first, second = constants

    class Model(torch.nn.Module):
        def forward(self, condition):
            x = torch.tensor(constants[0])
            y = torch.tensor(constants[1])

            def body_then(a, b):
                doubled = a * 2
                return doubled * b

            def body_else(a, b):
                shifted = a - 2
                return b + shifted

            outputs = poptorch.cond(condition, body_then, [x, y], body_else,
                                    [x, y])
            return outputs[0]

    # The model takes no input besides the condition.
    args = []
    exp_then = torch.tensor([a * 2 * b for a, b in zip(first, second)])
    exp_else = torch.tensor([a - 2 + b for a, b in zip(first, second)])

    if_else_harness(Model(), exp_then, exp_else, *args)
def test_operation_expecting_constant():
    """``topk`` needs a constant ``k``; check it works inside a cond body."""
    constant = [1.1, 2.3]

    class Model(torch.nn.Module):
        def forward(self, condition, z):
            x = torch.tensor(constant)

            def body_then(a, b):
                b = a * torch.topk(b, 2)[0]
                return b

            def body_else(a, b):
                a = a - 2
                b = b[:2] + a
                return b

            return poptorch.cond(condition, body_then, [x, z], body_else,
                                 [x, z])[0]

    arg = torch.rand(4)
    # `arg` is already a tensor and topk does not modify it, so it is passed
    # directly: wrapping it in torch.tensor() only makes a copy and triggers
    # PyTorch's copy-construct UserWarning.
    exp_then = torch.topk(arg, 2)[0] * torch.tensor(constant)
    exp_else = torch.tensor(constant) - 2 + arg[:2]

    if_else_harness(Model(), exp_then, exp_else, arg)
def test_diff_num_of_args():
    """The two branches may take a different number of arguments."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body_then(x, y):
                return x + y

            def body_else(x):
                return x

            then_inputs = [x, y]
            else_inputs = [x]
            outputs = poptorch.cond(condition, body_then, then_inputs,
                                    body_else, else_inputs)
            return outputs[0]

    first = torch.rand(1)
    second = torch.rand(1)
    exp_then = first + second

    if_else_harness(Model(), exp_then, first, first, second)
def test_single_body():
    """The same body function can be used for both branches."""

    class Model(torch.nn.Module):
        def forward(self, condition, x, y):
            def body(a, b):
                return a + b

            return poptorch.cond(condition, body, [x, y], body, [x, x])[0]

    args = [torch.rand(1) for _ in range(2)]
    # The sums are already fresh tensors: re-wrapping them in torch.tensor()
    # only copies them and triggers PyTorch's copy-construct UserWarning.
    exp_then = args[0] + args[1]
    exp_else = args[0] + args[0]

    if_else_harness(Model(), exp_then, exp_else, *args)
# Each index_opN reads ``t`` at a fixed indexing pattern when ``v`` is None,
# otherwise it writes ``v`` at that pattern in place and returns ``t``.


def index_op0(t, idx, v=None):
    """Pattern ``t[idx]``."""
    if v is not None:
        t[idx] = v
        return t
    return t[idx]


def index_op1(t, idx, v=None):
    """Pattern ``t[idx, idx]``."""
    if v is not None:
        t[idx, idx] = v
        return t
    return t[idx, idx]


def index_op2(t, idx, v=None):
    """Pattern ``t[:, idx]``."""
    if v is not None:
        t[:, idx] = v
        return t
    return t[:, idx]


def index_op3(t, idx, v=None):
    """Pattern ``t[idx, :, idx]``."""
    if v is not None:
        t[idx, :, idx] = v
        return t
    return t[idx, :, idx]


def index_op4(t, idx, v=None):
    """Pattern ``t[:, :, idx]``."""
    if v is not None:
        t[:, :, idx] = v
        return t
    return t[:, :, idx]


def index_op5(t, idx, v=None):
    """Pattern ``t[:, idx, idx]``."""
    if v is not None:
        t[:, idx, idx] = v
        return t
    return t[:, idx, idx]


def index_op6(t, idx, v=None):
    """Pattern ``t[idx, idx, idx, idx]``."""
    if v is not None:
        t[idx, idx, idx, idx] = v
        return t
    return t[idx, idx, idx, idx]


def index_op7(t, idx, v=None):
    """Pattern ``t[:, :, :, idx]``."""
    if v is not None:
        t[:, :, :, idx] = v
        return t
    return t[:, :, :, idx]


def index_op8(t, idx, v=None):
    """Pattern ``t[:, idx, :, idx]``."""
    if v is not None:
        t[:, idx, :, idx] = v
        return t
    return t[:, idx, :, idx]
def test_index_on_max_indices():
    """Index a tensor with the argmax indices computed from itself."""

    def op(x):
        _, argmax_tensor = torch.max(x, dim=1)
        gathered = x[:, argmax_tensor]
        return gathered, argmax_tensor

    inp_tensor = torch.rand(1, 10, 2)
    inputs = (inp_tensor, )

    # Only the first output (the gathered values) feeds the loss.
    model = helpers.ModelWithWeights(op, inp_tensor.shape, lambda x: x[0])
    poptorch_model = poptorch.trainingModel(model)

    native_out, _ = model(inputs)
    poptorch_out, _ = poptorch_model(inputs)

    # Inference test - check outputs
    for native, pop in zip(native_out, poptorch_out):
        helpers.assert_allclose(actual=pop, expected=native)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()
def get_index_fill_fn(dim):
    """Return an op that fills ``tensor`` in place along dimension ``dim``.

    The returned callable takes ``(tensor, index, value)``, calls
    ``tensor.index_fill_(dim, index, value)`` and returns ``tensor``.
    """

    def index_fill(tensor, index, value):
        tensor.index_fill_(dim, index, value)
        return tensor

    return index_fill
@helpers.printCapfdOnExit
@helpers.overridePoptorchLogLevel("TRACE")
@pytest.mark.parametrize("dim", [0, 1])
def test_vectorized_scatter(capfd, dim):
    """Assigning whole rows / columns by index must use the vectorized
    ScatterReduce path, as reported in the TRACE log."""

    def op(out, index, src):
        # Index along `dim` and take the other dimension whole.
        if dim == 0:
            target = (index, slice(None))
        else:
            target = (slice(None), index)
        out[target] = src
        return out

    torch.manual_seed(0)
    N = 20
    out = torch.randn(N, 30)
    sz = out.shape[dim] - N // 10
    indices = torch.randint(sz, (sz, ))
    if dim == 0:
        src_sz = (sz, out.shape[1])
    else:
        src_sz = (out.shape[0], sz)
    src = torch.randn(src_sz)

    model = helpers.ModelWithWeights(op, out.shape)
    poptorch_model = poptorch.trainingModel(model)

    # Clone the tensor so that the original is unchanged by the in-place op
    native_out, _ = model((out.clone(), indices, src))
    poptorch_out, _ = poptorch_model((out.clone(), indices, src))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()

    log = helpers.LogChecker(capfd).createIterator()
    log.findNext("Using vectorized ScatterReduce with none reduction")
def test_inplace_add_multi_elements():
    """An in-place add on the second input still updates the caller's
    tensor when the first input is a nested structure."""

    class Model(nn.Module):
        def forward(self, _x, y):
            y += 1

    poptorch_model = poptorch.inferenceModel(Model())

    pair = (torch.Tensor([1.0]), torch.Tensor([1.0]))
    # NOTE(review): this second element is a bare tensor, not a 1-tuple (the
    # original wrote it in parentheses without a trailing comma) - confirm
    # whether a nested 1-tuple was intended.
    single = torch.Tensor([1.0])
    nested_tuple_in = (pair, single)
    tensor_in = torch.Tensor([1.0])

    result = poptorch_model(nested_tuple_in, tensor_in)
    assert result is None
    assert tensor_in == 2.0
def test_inplace_masked_fill():
    """``masked_fill_`` on an input is reflected in the caller's tensor."""

    class Model(nn.Module):
        def forward(self, x):
            x.masked_fill_(x > 0.5, 1.0)

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.7], [0.2, 3.5]])
    poptorch_model(x)

    # Entries above 0.5 become 1.0, the others are untouched.
    expected = (
        ((0, 0), 0),
        ((0, 1), 1.0),
        ((1, 0), 0.2),
        ((1, 1), 1.0),
    )
    for (row, col), value in expected:
        assert x[row][col] == value
def test_inplace_zero():
    """``zero_`` followed by an in-place add updates the caller's tensor."""

    class Model(nn.Module):
        def forward(self, x):
            # (Simply setting it to zero gets pruned by PopART)
            a = torch.sum(x)
            x.zero_()
            x += a

    poptorch_model = poptorch.inferenceModel(Model())
    x = torch.tensor([[0, 0.5], [0.25, 2.0]])
    poptorch_model(x)

    # Every element ends up holding the sum of the original input.
    for row in range(2):
        for col in range(2):
            assert x[row][col] == 2.75
def test_inplace_on_buffer_and_input():
    """In-place ops on the input and on a buffer in the same forward."""
    fill_value = 3
    shape = (1, 2)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buffer", torch.ones(shape))

        def forward(self, x):
            # Perform inplace ops on both the input and our buffer.
            x.fill_(fill_value)
            self.buffer.copy_(self.buffer + x)
            return self.buffer, x

    model = poptorch.inferenceModel(Model())
    buf, out = model(torch.ones(shape))

    # The input is overwritten with fill_value; the buffer started at one.
    expected_out = torch.full(shape, fill_value)
    expected_buf = expected_out + 1
    helpers.assert_allequal(actual=out, expected=expected_out)
    helpers.assert_allequal(actual=buf, expected=expected_buf)
# Ops modifying a slice of ``x`` in place; ``step`` is a one-element tensor
# holding the slice step. Each op returns ``x``.


def direct_assign(x, step):
    """Overwrite rows ``0:2:step`` with themselves multiplied by zero."""
    rows = slice(0, 2, step.item())
    x[rows] = x[rows] * 0
    return x


def direct_assign_inplace(x, step):
    """Multiply rows ``0:2:step`` by zero with an augmented assignment."""
    rows = slice(0, 2, step.item())
    x[rows] *= 0
    return x


def direct_fill(x, step):
    """Assign the scalar zero to rows ``0:2:step``."""
    rows = slice(0, 2, step.item())
    x[rows] = 0
    return x


# Slicing entire dimensions lowers to slice(slice(x))
def chained_slice(x, step):
    """Zero columns ``:2:step`` through ``mul_`` on the sliced view."""
    cols = slice(None, 2, step.item())
    x[:, cols].mul_(0)
    return x


def modify_before_assign(x, step):
    """Double ``x`` in place, then zero rows ``0:2:step``."""
    x *= 2
    rows = slice(0, 2, step.item())
    x[rows] = x[rows] * 0
    return x


def modify_region(x, step):
    """Add one to rows ``1:x.shape[0]:step``."""
    rows = slice(1, x.shape[0], step.item())
    x[rows, :] += 1
    return x
def test_index_put_on_buffer():
    """Index-assigning into a registered buffer matches the native result."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            p_init = torch.arange(6, dtype=torch.float).reshape(2, 3)
            self.register_buffer("p", p_init)

        def forward(self, x, idx):
            self.p[(idx, )] = x
            return self.p

    # Separate instances, so the native run does not touch the buffer of
    # the model handed to PopTorch.
    cpu_model = Model()
    ipu_model = poptorch.inferenceModel(Model())

    idx = torch.tensor([0])
    x = torch.empty(3).fill_(-1)

    cpu_out = cpu_model(x, idx)
    ipu_out = ipu_model(x, idx)
    helpers.assert_allclose(actual=ipu_out, expected=cpu_out)
def test_type_change():
    """Run a compiled tuple adder, then feed it inputs of another dtype.

    The second dtype must be rejected while the model is still compiled
    for float32, and accepted once the model has been destroyed.
    """

    class SimpleAdder(nn.Module):
        def forward(self, t):
            assert isinstance(t, tuple)
            lhs, rhs = t
            assert isinstance(lhs, torch.Tensor)
            assert isinstance(rhs, torch.Tensor)
            return lhs + rhs

    adder = SimpleAdder()
    inference_model = poptorch.inferenceModel(adder)

    float_pair = (torch.tensor([1.]), torch.tensor([2.]))
    # Run more than once
    for _ in range(2):
        assert inference_model(float_pair).float() == 3.0

    int_pair = (torch.tensor([1]), torch.tensor([2]))
    error_msg = ".*expected torch.float32 but got torch.int64.*"
    with pytest.raises(poptorch.Error, match=error_msg):
        assert inference_model(int_pair).float() == 3

    # After destroy() the same call is expected to succeed.
    inference_model.destroy()
    assert inference_model(int_pair).float() == 3
@pytest.mark.parametrize("use_half", [True, False])
@pytest.mark.parametrize("thing_to_test", ['List', 'Tuple', 'Mixed'])
def test_nested_tuples_and_lists(use_half, thing_to_test):
    """Pass nested lists, nested tuples or a mix of both as model inputs.

    Seven one-element tensors holding 1..7 are summed by the model, so
    the expected result is always 28.
    """

    class SimpleAdder(nn.Module):
        def forward(self, tpl1, t2, tpl34567):
            (t1, ) = tpl1
            (t3, (t4, t5), _) = tpl34567
            (t6, _) = tpl34567[2]
            t7 = tpl34567[2][1]

            for unpacked in (t1, t2, t3, t4, t5, t6, t7):
                assert isinstance(unpacked, torch.Tensor)

            return t1 + t2 + t3 + t4 + t5 + t6 + t7

    model = SimpleAdder()
    inference_model = poptorch.inferenceModel(model)

    # The 4th and 7th tensors are created as float64, the others use the
    # default dtype (dtype=None).
    tensors = [
        torch.tensor([float(value)],
                     dtype=torch.float64 if value in (4, 7) else None)
        for value in range(1, 8)
    ]

    if use_half:
        model.half()
        tensors = [tensor.half() for tensor in tensors]

    t1, t2, t3, t4, t5, t6, t7 = tensors

    if thing_to_test == "List":
        call_args = ([t1], t2, [t3, [t4, t5], [t6, t7]])
    elif thing_to_test == "Tuple":
        call_args = ((t1, ), t2, (t3, (t4, t5), (t6, t7)))
    else:
        call_args = ([t1], t2, [t3, (t4, t5), [t6, t7]])

    # Run more than once
    for i in range(2):
        print(f"Run {i}")
        assert inference_model(*call_args).float() == 28.0
def test_non_tensor_inputs_dispatch():
    """Call a model mixing tensor, scalar and optional tensor arguments.

    Every call pattern is run on a freshly wrapped inference model and
    compared against the native model.
    """

    class Model(nn.Module):
        def forward(
                self,
                t1,
                scalar=2,
                t2_opt=None,
        ):
            if t2_opt is not None:
                return t2_opt * scalar + t1 * scalar
            return t1 * scalar

    model = Model()

    def check(ipu_args, ipu_kwargs, cpu_args, cpu_kwargs):
        # A new poptorch wrapper per call pattern; IPU runs before CPU.
        ipu = poptorch.inferenceModel(model)(*ipu_args, **ipu_kwargs)
        cpu = model(*cpu_args, **cpu_kwargs)
        helpers.assert_allclose(expected=cpu, actual=ipu)

    t1 = torch.tensor([3.])
    check((t1, ), {}, (t1, ), {})

    scalar = 4
    check((t1, scalar), {}, (t1, scalar), {})

    t2 = torch.tensor([5.])
    check((t1, scalar, t2), {}, (t1, scalar, t2), {})

    # Positional + keyword on the IPU, keywords only on the CPU.
    check((t1, ), {"t2_opt": t2}, (), {"t1": t1, "t2_opt": t2})
def test_dict_input():
    """Feed a dict of tensors to the model.

    Key order must not matter, while a missing or an extra key must be
    reported as an error.
    """

    class DictDivider(nn.Module):
        def forward(self, d):  # pylint: disable=unused-argument
            return d['x'] / d['y']

    model = DictDivider()
    z = {'x': torch.tensor([1.]), 'y': torch.tensor([2.])}
    native_out = model(z)

    inference_model = poptorch.inferenceModel(model)

    # Same content, keys inserted in the opposite order
    reordered = {'y': torch.tensor([2.]), 'x': torch.tensor([1.])}
    for valid_input in (z, reordered):
        poptorch_out = inference_model(valid_input)
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)

    # Missing argument
    with pytest.raises(poptorch.Error, match="Missing arguments: x."):
        inference_model({'y': torch.tensor([2.])})

    # Extra argument
    extra_key = {
        'x': torch.tensor([1.]),
        'y': torch.tensor([2.]),
        'z': torch.tensor([3.])
    }
    with pytest.raises(poptorch.Error, match="Unexpected arguments: z."):
        inference_model(extra_key)
@pytest.mark.parametrize("device_iterations", [1, 3])
def test_custom_input(device_iterations):
    """Round-trip a user-defined container through a custom arg parser.

    The model receives a ``MyArg`` holding one tensor, attaches a second
    tensor to it and returns it. Both attributes are checked on the
    object handed back by the IPU model.
    """
    batch_size = 2
    combined_batch_size = device_iterations * batch_size

    class MyArg:
        def __init__(self, **kwargs):
            for key, value in kwargs.items():
                setattr(self, key, value)

    class MyArgParser(poptorch.ICustomArgParser):
        def yieldTensors(self, struct):
            # Sorted keys give yieldTensors and reconstruct the same order.
            for key in sorted(vars(struct)):
                maybe_tensor = vars(struct)[key]
                if isinstance(maybe_tensor, torch.Tensor):
                    yield maybe_tensor

        def reconstruct(self, structure, tensor_iterator):
            data = {}
            for key in sorted(vars(structure)):
                value = vars(structure)[key]
                # Only tensor attributes were yielded, so only those may
                # consume from the iterator; anything else is carried over.
                if isinstance(value, torch.Tensor):
                    value = next(tensor_iterator)
                data[key] = value
            return MyArg(**data)

    poptorch.registerCustomArgParser(MyArg, MyArgParser())

    class SimpleAdder(torch.nn.Module):
        def forward(self, custom_input):
            assert custom_input.tensor.shape[0] == batch_size
            custom_input.result = custom_input.tensor + custom_input.tensor
            return custom_input

    adder_model = SimpleAdder()
    adder_model.eval()

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations=device_iterations)

    ipu_adder_model = poptorch.inferenceModel(adder_model, opts)

    for i in range(4):
        batch = torch.full((combined_batch_size, 1), i)
        result = ipu_adder_model(MyArg(tensor=batch))
        assert torch.equal(result.tensor, batch)
        assert torch.equal(result.result,
                           torch.full((combined_batch_size, 1), i + i))
class Model(torch.nn.Module):
    """Model with one mandatory and three optional tensor inputs.

    Each optional input that is not ``None`` is added to the running
    value, which is then scaled (by 3 for ``y``, 4 for ``z``, 5 for
    ``t``). The result goes through ``tanh``.
    """

    def forward(self, x, y=None, z=None, t=None):
        acc = x
        for operand, scale in ((y, 3), (z, 4), (t, 5)):
            if operand is None:
                continue
            acc = torch.add(acc, operand) * scale
        return torch.tanh(acc)
@pytest.mark.parametrize("fwd_args", [True, False])
def test_custom_arg_parser(fwd_args):
    """Use a custom parser for the input and an OrderedDict-based output.

    The output container type and the order of its keys must be the same
    for the native and the poptorch model.
    """

    class MyArg:
        def __init__(self, a, b):
            self.a = a
            self.b = b

    class MyParser(poptorch.ICustomArgParser):
        # This is a generator, so it must not be annotated "-> None".
        def yieldTensors(self, struct):
            yield struct.a
            yield struct.b

        def reconstruct(self, _original_structure, tensor_iterator):
            return MyArg(next(tensor_iterator), next(tensor_iterator))

    class OutputContainer(collections.OrderedDict):
        def print(self):
            return str(self)

    poptorch.registerCustomArgParser(MyArg, MyParser())

    class Model(torch.nn.Module):
        def forward(self, args):
            # Make sure to use a poptorch specific op
            # to check the graph is not empty or running on the CPU
            out = OutputContainer()
            out["sum"] = args.a + poptorch.ipu_print_tensor(args.b)
            out["a"] = args.a
            return out

    class ModelWrapper(Model):
        def forward(self, *args, **kwargs):
            print(len(args))
            return super().forward(*args, **kwargs)

    if fwd_args:
        model = ModelWrapper()
    else:
        model = Model()

    poptorch_model = poptorch.inferenceModel(model)

    for i in range(2):
        print(f"Run {i}")
        # Fresh random inputs on every run.
        args = MyArg(torch.randn(2, 2), torch.randn(2, 2))
        native_out = model(args)
        poptorch_out = poptorch_model(args)
        # Make sure we get an OutputContainer and the elements are in the
        # same order
        assert isinstance(native_out, OutputContainer)
        assert isinstance(poptorch_out, OutputContainer)
        print(native_out.print())
        print(poptorch_out.print())
        for native_key, poptorch_key in zip(native_out, poptorch_out):
            assert native_key == poptorch_key
            helpers.assert_allclose(expected=native_out[native_key],
                                    actual=poptorch_out[poptorch_key])
def test_return_and_use_input():
    """Return a constant together with a value computed from it."""

    class Model(torch.nn.Module):
        def forward(self, input):
            c = torch.tensor([1.])
            return c, input + c

    poptorch_model = poptorch.inferenceModel(Model())

    # (input value, expected second output); the first output is always 1.
    for value, expected_sum in ((0., 1.), (1., 2.)):
        expected = (torch.tensor([1.]), torch.tensor([expected_sum]))
        assert poptorch_model(torch.tensor([value])) == expected
def get_mean_cycle_count(io_dtype, capfd):
    """Return the mean logged IPU cycle count over one pass of the dataset.

    Args:
        io_dtype: dtype used for the dataset items and for the model output.
        capfd: pytest capture fixture the cycle-count log lines are read from.

    Returns:
        The mean of the "Total number of IPU cycles" values found in the
        captured log, one value being expected per model call.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            x = x.to(torch.float32)
            x = x * 2
            return x.to(io_dtype)

    opts = poptorch.Options()
    opts.logCycleCount(True)

    data_loader = poptorch.DataLoader(
        opts,
        ImageDataset(io_dtype),
        BATCH_SIZE,
        shuffle=False,
        drop_last=True,
    )

    num_iterations = 0
    # Terminate the loader even if compilation or inference raises.
    try:
        model = poptorch.inferenceModel(Model(), opts)
        for x in data_loader:
            num_iterations += 1
            _ = model(x)
    finally:
        data_loader.terminate()

    log_matches = helpers.LogChecker(capfd).createIterator().findAll(
        r'Total number of IPU cycles: (\d+)')
    assert len(log_matches) == num_iterations

    cycle_counts = [int(match.group(1)) for match in log_matches]

    return np.array(cycle_counts).mean()
@pytest.mark.parametrize(
    "title,print_gradient,summarise_threshold,edge_items,"
    "max_line_width,digits,float_format,separator,brackets_type,"
    "match_str_idx",
    [("title", True, 1000, 3, 75, 8, "auto", None, "curly", 0),
     ("title", True, 500, 2, 15, 8, "scientific", ",", "square", 1),
     ("title", True, 1500, 1, 125, 8, "fixed", ";", "parentheses", 2)])
def test_print_ipu_tensor(capfd, title, print_gradient, summarise_threshold,
                          edge_items, max_line_width, digits, float_format,
                          separator, brackets_type, match_str_idx):
    """Print a tensor from the IPU with various formatting options.

    The text captured on stderr must contain the entry of ``match_str``
    selected by ``match_str_idx``.
    """
    if separator is None:
        separator = " "

    open_bracket, close_bracket = brackets[brackets_type]

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            x = x + 1
            x = poptorch.ipu_print_tensor(x, title, print_gradient,
                                          summarise_threshold, edge_items,
                                          max_line_width, digits,
                                          float_format, separator,
                                          open_bracket, close_bracket)
            return x + self.bias

    poptorch_model = poptorch.inferenceModel(Model())

    # Seed just before creating the input so it has the known values
    # spelled out in match_str.
    torch.manual_seed(0)
    sample = torch.rand((2, 2))
    _ = poptorch_model(sample)

    # Very awkward to test this 'dynamically' so just test against some known
    # outputs above. Quite small tensors to test, but testing large ones would
    # be messy.
    captured = capfd.readouterr()
    assert match_str[match_str_idx] in captured.err
def test_loop_non_tensor_in():
    """A non-tensor element in the for_loop input list must be rejected."""

    class Model(torch.nn.Module):
        def forward(self, x, _):
            def body(x, y):
                return x * y, y + 1

            # The second list element is a plain int, not a tensor.
            return poptorch.for_loop(10, body, [x, 5])

    inference_model = poptorch.inferenceModel(Model())

    first = torch.tensor([1.])
    second = torch.tensor([2.])

    with pytest.raises(
            ValueError,
            match="(Object contained in list at index 1 is not torch.tensor)"
    ):
        inference_model(first, second)
def test_loop_training():
    """Using poptorch.for_loop in a training model must raise an error."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = torch.nn.Linear(4, 4)

        def forward(self, x):
            def body(x):
                return self.layer1(x)

            out = poptorch.for_loop(2, body, [x])[0]
            loss = poptorch.identity_loss(out, reduction='sum')
            return out, loss

    training_model = poptorch.trainingModel(Model())
    sample = torch.ones(1, 4).to(torch.float)

    expected_error = r"poptorch.for_loop\(\) is only supported in inference"
    with pytest.raises(poptorch.Error, match=expected_error):
        training_model(sample)
def test_loop_body_inplace_ops_4():
    """In-place ops before, inside and after a for_loop.

    The input is incremented once before and once after the loop, so it
    ends at 3. The loop output is taken after the first increment plus
    three loop iterations, so it ends at 5.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            x += 1

            # Body inputs are passed by value so 'x' remains unchanged.
            def body(y):
                y += 1
                return y

            z = poptorch.for_loop(3, body, [x])[0]
            x += 1
            return z

    poptorch_model = poptorch.inferenceModel(Model())

    model_input = torch.ones(1, 5).to(torch.int32)
    ones = torch.ones(1, 5).to(torch.int32)

    out = poptorch_model(model_input)

    helpers.assert_allequal(actual=model_input, expected=ones * 3)
    helpers.assert_allequal(actual=out, expected=ones * 5)
def test_loop_with_same_trip_count_on_multiple_ipus():
    """Run two for_loops with the same trip count in two different blocks.

    Each loop adds ``y`` to ``x`` five times, so the first output is
    expected to be ``x + 10 * y``.
    """

    class Model(torch.nn.Module):
        def forward(self, x, y):
            def func(x, y):
                x = x + y
                return x, y

            # Note: both trip_count equal to 5
            for ipu_id in range(2):
                with poptorch.Block(str(ipu_id), ipu_id=ipu_id):
                    x, y = poptorch.for_loop(5, func, [x, y])
            return x, y

    native = Model()

    opts = poptorch.Options()
    opts.setExecutionStrategy(
        poptorch.ShardedExecution(
            *[poptorch.Stage(str(index)) for index in range(2)]))
    ipu = poptorch.inferenceModel(native, opts)

    x = torch.tensor([1., 2.])
    y = torch.tensor([1., 2.])

    ipu_out = ipu(x, y)[0]
    native_out = x + 10 * y
    helpers.assert_allclose(actual=ipu_out, expected=native_out)
import random
import os  # pylint: disable=unused-import
import unittest.mock
import torch
import torch.optim as optim
import torch.nn.functional as F
import pytest
import helpers
import poptorch


def loss_harness(loss,
                 inputs,
                 target,
                 reduction,
                 op=None,
                 training=True,
                 **kwargs):
    """Run a loss function natively and through PopTorch and compare outputs.

    A ``helpers.ModelWithWeights`` is built around ``op`` with ``loss`` as its
    loss function. The model is called once natively and once through
    PopTorch, and the two outputs are compared with
    ``helpers.assert_allclose``.

    Args:
        loss: Functional loss. Called with the model output(s), then
            ``target`` (omitted when there are three inputs), then
            ``reduction=reduction`` and ``**kwargs``.
        inputs: List of one, two or three input tensors.
        target: Target passed to ``loss``; tensors (and dicts of tensors) are
            moved to the device of the first loss argument.
        reduction: Value forwarded as the ``reduction`` keyword of ``loss``.
        op: Operation wrapped by the model. Defaults to returning the inputs
            unchanged.
        training: Wrap the model with ``poptorch.trainingModel`` when true,
            ``poptorch.inferenceModel`` otherwise.
        **kwargs: Extra keyword arguments for ``loss``, moved to the device of
            the first loss argument like ``target``.

    Returns:
        Tuple ``(poptorch_model, poptorch_out, poptorch_loss)``.
    """

    def moveTo(structure, device):
        # Recursively move tensors (possibly nested in dicts) to `device`;
        # anything else is returned untouched.
        if isinstance(structure, dict):
            return {k: moveTo(v, device) for k, v in structure.items()}
        if torch.is_tensor(structure):
            return structure.to(device)
        return structure

    if len(inputs) == 1:
        loss_fn = lambda x: loss(x,
                                 moveTo(target, x.device),
                                 reduction=reduction,
                                 **moveTo(kwargs, x.device))
        if op is None:
            op = lambda x: x
    elif len(inputs) == 2:
        loss_fn = lambda x, y: loss(x,
                                    y,
                                    moveTo(target, x.device),
                                    reduction=reduction,
                                    **moveTo(kwargs, x.device))
        if op is None:
            op = lambda x, y: (x, y)
    else:
        assert len(inputs) == 3
        # The only supported loss fn with 3 inputs is TripletMarginLoss
        # which has no "target" per se
        loss_fn = lambda x, y, z: loss(
            x, y, z, reduction=reduction, **moveTo(kwargs, x.device))
        if op is None:
            op = lambda x, y, z: (x, y, z)

    model = helpers.ModelWithWeights(op, inputs[0].shape, loss_fn=loss_fn)
    poptorch_model = poptorch.trainingModel(
        model) if training else poptorch.inferenceModel(model)

    native_out, _ = model(tuple(inputs))
    poptorch_out, poptorch_loss = poptorch_model(tuple(inputs))

    # Inference test - check outputs
    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    if training:
        # Training test - check weights have changed
        poptorch_model.assert_weights_changed()

    # Return the poptorch model and original outputs for any further
    # testing
    return poptorch_model, poptorch_out, poptorch_loss


@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_L1Loss(reduction):
    """Train with L1 loss until the model output matches the target."""
    torch.manual_seed(42)
    target = torch.randn(10)
    input = torch.randn(10)

    poptorch_model, original, original_loss = loss_harness(
        F.l1_loss, [input], target, reduction)

    # Make sure the first run doesn't already pass the test.
    assert original_loss > 0.1
    assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02)

    for i in range(1000):
        out, loss = poptorch_model((input, ))

        # Model needs to adjust the LR in the middle to converge
        if i == 500:
            poptorch_model.setOptimizer(
                optim.SGD(poptorch_model.model.parameters(), lr=0.001))

    # Check we have trained the "model"
    assert loss < original_loss

    # "sum" L1 losses tend to be very large compared to "mean"
    if reduction == "sum":
        assert loss < 0.1
    else:
        assert loss < 0.001

    helpers.assert_allclose(actual=out,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_MSELoss(reduction):
    """Train with MSE loss until the model output matches the target."""
    torch.manual_seed(42)
    target = torch.randn(10)
    input = torch.randn(10)

    poptorch_model, original, original_loss = loss_harness(
        F.mse_loss, [input], target, reduction)

    # Make sure the first run doesn't already pass the test
    assert original_loss > 0.1
    assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02)

    for _ in range(1000):
        out, loss = poptorch_model((input, ))

    # Check we have trained the "model"
    assert loss < 0.001
    helpers.assert_allclose(actual=out,
                            expected=target,
                            rtol=1e-02,
                            atol=1e-02)


cross_entropy_params = [
    # Input shape, reduction
    ((1, 10), "mean"),
    ((1, 10, 2), "sum"),
    ((1, 10, 2, 3), "mean"),
]


@pytest.mark.parametrize("input_shape, reduction", cross_entropy_params)
def test_CrossEntropy(input_shape, reduction):
    """Train with cross entropy until argmax of the output equals the label."""
    torch.manual_seed(42)
    input = torch.randn(input_shape)

    # Labels have the input shape without the class dimension (dim 1).
    label_shape = [input_shape[0]]
    if len(input_shape) > 2:
        label_shape.extend(input_shape[2:])
    label = torch.randint(0, 10, label_shape)

    poptorch_model, _, original_loss = loss_harness(F.cross_entropy, [input],
                                                    label, reduction)

    for _ in range(100):
        out, loss = poptorch_model((input, ))

    # Check we have trained the "model"
    assert loss < original_loss
    helpers.assert_allequal(actual=torch.argmax(out, dim=1), expected=label)


# Test softmax and logsoftmax for dimensions more than 2
def op_withdim(op, input):
    """Check that `op` gives the same result natively and on the IPU."""
    # Run on CPU.
    native_out = op(input)

    # Run on IPU.
    poptorch_model = poptorch.inferenceModel(op)
    poptorch_out = poptorch_model(input)

    helpers.assert_allclose(expected=native_out, actual=poptorch_out)


ops_float = [
    torch.nn.Softmax,
    torch.nn.LogSoftmax,
]


# The upper bound is the tensor rank so that the last positive dimension
# index is exercised as well as every negative one.
@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
@pytest.mark.parametrize("op", ops_float)
@pytest.mark.parametrize("dim", range(-4, 4))
def test_op_withdim_4d(op, dim):
    """Softmax / LogSoftmax over each dimension of a 4D tensor."""
    N, C = 11, 22
    M, K = 33, 44
    torch.manual_seed(42)
    x = torch.randn(N, C, M, K)

    op_withdim(op(dim=dim), x)


@pytest.mark.parametrize("op", ops_float)
@pytest.mark.parametrize("dim", range(-2, 2))
def test_op_withdim_2d(op, dim):
    """Softmax / LogSoftmax over each dimension of a 2D tensor."""
    N, C = 17, 13
    torch.manual_seed(42)
    x = torch.randn(N, C)

    op_withdim(op(dim=dim), x)


# Test NLL loss by using it to match a target label.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_NLLLoss(reduction):
    """Train log_softmax + NLL loss until argmax matches the label."""
    torch.manual_seed(42)
    op = lambda x: F.log_softmax(x, dim=1)
    label = torch.randint(0, 10, [1])
    input = torch.randn(1, 10)

    poptorch_model, _, original_loss = loss_harness(F.nll_loss, [input], label,
                                                    reduction, op)

    for _ in range(100):
        out, loss = poptorch_model((input, ))

    # Check we have trained the "model"
    assert loss < original_loss
    assert torch.argmax(out, dim=1) == label


# Test NLL loss 2d by using it to match a target label.
@pytest.mark.parametrize("reduction", ["mean", "sum"])
def test_NLLLoss2d(reduction):
    """Train log_softmax + NLL loss on (N, C, M, M) input and (N, M, M) labels."""
    torch.manual_seed(42)
    N, C, M = 3, 2, 5
    op = lambda x: F.log_softmax(x, dim=1)
    y = torch.empty(N, M, M, dtype=torch.long).random_(0, C)
    x = torch.randn(N, C, M, M)

    poptorch_model, _, original_loss = loss_harness(F.nll_loss, [x], y,
                                                    reduction, op)

    for _ in range(100):
        out, loss = poptorch_model((x, ))

    # Check we have trained the "model"
    assert loss < original_loss
    # Class indices are integers: compare them exactly, as test_CrossEntropy
    # does.
    helpers.assert_allequal(actual=torch.argmax(out, dim=1), expected=y)


# This also serves as the NLL loss test as it uses NLL under the hood.
@pytest.mark.parametrize("reduction", ["mean", "sum"]) def test_BCE(reduction): torch.manual_seed(42) target = torch.empty(10).uniform_() input = torch.randn(10) poptorch_model, _, original_loss = loss_harness(F.binary_cross_entropy, [input], target, reduction, op=torch.sigmoid) # Make sure the first run doesn't already pass the test. _, original_loss = poptorch_model((input, )) for _ in range(0, 2500): out, loss = poptorch_model((input, )) # # Check we have trained the "model" assert loss < original_loss helpers.assert_allclose(actual=out, expected=target, rtol=1e-02, atol=1e-02) # TODO(T22975) # This also servees as the NLL loss test as it uses NLL under the hood. # Re-enable once pytorch fixes https://github.com/pytorch/pytorch/issues/40679 # def test_BCE_direct_with_weight(): # reductions = ["mean", "sum"] # torch.manual_seed(42) # for reduction in reductions: # weight = torch.randn(10) # model = torch.nn.BCELoss(weight=weight, reduction=reduction) # poptorch_model = poptorch.inferenceModel(model) # for i in range(0, 10): # target = torch.empty(10, 10).random_(2) # input = torch.empty(10, 10).uniform_() # groundTruth = model(input, target) # poptorch_out = poptorch_model(input, target) # helpers.assert_allclose(expected=groundTruth, actual=poptorch_out) @pytest.mark.parametrize("reduction", {"mean", "sum", "batchmean"}) @pytest.mark.parametrize("log_target", {True, False}) def test_KLDiv(reduction, log_target): torch.manual_seed(42) # 2D Tensors to test batchmean target = torch.empty(3, 10).uniform_(-1, 1) input = torch.randn(3, 10) loss_harness(F.kl_div, [input], target, reduction, log_target=log_target) @pytest.mark.parametrize("reduction", {"mean", "sum"}) @pytest.mark.parametrize("log_input", {True, False}) @pytest.mark.parametrize("full", {True, False}) def test_PoissonNLLLoss(reduction, log_input, full): torch.manual_seed(42) target = torch.poisson(torch.rand(10) * 5) input = torch.empty(10).uniform_() loss_harness(F.poisson_nll_loss, [input], target, 
reduction, full=full, log_input=log_input) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_HingeEmbeddingLoss(reduction): torch.manual_seed(42) delta = torch.rand(1) + 0.5 # Generate random set of 1s and -1s for labels target = torch.randint(2, [10]) * 2 - 1 input = torch.empty(10).uniform_() loss_harness(F.hinge_embedding_loss, [input], target, reduction, margin=delta.item()) torch.manual_seed(42) params_bcewithlogits = [ ( torch.rand(10, 3), # Inputs torch.empty(10, 3).uniform_(), # Targets torch.rand(10, 3), # Weights torch.rand(3) # Pos Weights ), # Numerical stability test (torch.tensor([88.0]), torch.tensor([0.5]), None, None) ] @pytest.mark.parametrize("reduction", {"mean", "sum"}) @pytest.mark.parametrize("input, target, weight, pos_weight", params_bcewithlogits) def test_BCEWithLogitsLoss(reduction, input, target, weight, pos_weight): loss_harness(F.binary_cross_entropy_with_logits, [input], target, reduction, weight=weight, pos_weight=pos_weight) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_SmoothL1Loss(reduction): torch.manual_seed(42) input = torch.randn(10) target = torch.empty(10).uniform_() loss_harness(F.smooth_l1_loss, [input], target, reduction) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_SoftMarginLoss(reduction): torch.manual_seed(42) input = torch.empty(10).uniform_() # Generate random set of 1s and -1s for labels target = torch.randint(2, [10]) * 2 - 1 loss_harness(F.soft_margin_loss, [input], target, reduction) # TODO(T30688): Support MultiLabelSoftMarginLoss @pytest.mark.skip() @pytest.mark.parametrize("reduction", {"mean", "sum"}) @pytest.mark.parametrize("specify_weight", {True, False}) def test_MultiLabelSoftMarginLoss(reduction, specify_weight): torch.manual_seed(42) weight = torch.randn(3, 10) if specify_weight else None input = torch.empty(3, 10).uniform_() # Generate random set of 0s and 1s for labels target = torch.randint(2, [3, 10]) loss_harness(F.multilabel_soft_margin_loss, 
[input], target, reduction, weight=weight) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_CosineEmbeddingLoss(reduction): pytest.skip("TODO(T66165): Fails due to detach op pass") torch.manual_seed(42) # Margin should be between -1 and 1 margin = torch.rand(1) * 2 - 1 input1 = torch.empty(10, 3).uniform_() input2 = torch.empty(10, 3).uniform_() # Generate random set of 1s and -1s for labels target = torch.randint(2, [10]) * 2 - 1 loss_harness(F.cosine_embedding_loss, [input1, input2], target, reduction, margin=margin.item()) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_MarginRankingLoss(reduction): torch.manual_seed(42) # Margin should be between -1 and 1 margin = torch.rand(1) * 2 - 1 # As per the current PyTorch implementation, both dims must be equal input1 = torch.empty(10, 10).uniform_() input2 = torch.empty(10, 10).uniform_() # Generate random set of 1s and -1s for labels target = torch.randint(2, [10, 10]) * 2 - 1 loss_harness(F.margin_ranking_loss, [input1, input2], target, reduction, margin=margin.item()) @pytest.mark.parametrize("p", {2., 3.}) @pytest.mark.parametrize("swap", {True, False}) @pytest.mark.parametrize("reduction", {"mean", "sum"}) def test_TripletMarginLoss(p, swap, reduction): torch.manual_seed(42) # Between 0 and 2 margin = torch.rand(1) * 2 anchor = torch.randn(10, 5) positive = torch.randn(10, 5) negative = torch.randn(10, 5) loss_harness(F.triplet_margin_loss, [anchor, positive, negative], None, reduction, margin=margin.item(), p=p, swap=swap) @pytest.mark.parametrize("blank", {0, 3}) @pytest.mark.parametrize("reduction", {"mean", "sum"}) @pytest.mark.parametrize("zero_infinity", [True, False]) @pytest.mark.parametrize("lengths_are_tensors", [True, False]) def test_CTCLoss(blank, reduction, zero_infinity, lengths_are_tensors): T = 10 # Input sequence length N = 4 # Batch size C = 5 # Number of classes S = 6 if not zero_infinity else 10 # Target sequence length S_min = 3 # Minimum target length 
torch.manual_seed(42) # Initialize random batch of input vectors, for *size = (T,N,C) input = torch.randn(T, N, C).log_softmax(-1).detach() if lengths_are_tensors: input_lengths = torch.full(size=(N, ), fill_value=T, dtype=torch.long) target_lengths = torch.randint(low=S_min, high=S, size=(N, ), dtype=torch.long) else: input_lengths = [T] * N target_lengths = [random.randint(S_min, S - 1) for _ in range(N)] # Initialize random batch of targets (0..C excluding the blank class) target = torch.randint(low=0, high=C - 1, size=(N, S), dtype=torch.long) target[target > blank] += 1 loss_harness(F.ctc_loss, [input], target, reduction, input_lengths=input_lengths, target_lengths=target_lengths, blank=blank, zero_infinity=zero_infinity) @pytest.mark.parametrize("reduction", ("mean", "sum")) def test_identity_with_linear_out_returned(reduction): torch.manual_seed(42) el_in = 2 class Model(torch.nn.Module): def __init__(self): super().__init__() self.lin = torch.nn.Linear(el_in, el_in) def forward(self, x): out = self.lin(x) loss = poptorch.identity_loss(out, reduction=reduction) return loss, out x = torch.rand(1, 1, el_in) model = Model() native_loss, native_out = model(x) poptorch_model = poptorch.trainingModel(model) poptorch_loss, poptorch_out = poptorch_model(x) helpers.assert_allclose(actual=poptorch_loss, expected=native_loss) helpers.assert_allclose(actual=poptorch_out, expected=native_out) assert native_loss.shape != native_out.shape ================================================ FILE: tests/lstm_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import torch import torch.nn as nn import helpers import poptorch def test_lstm(): torch.manual_seed(42) lstm = nn.LSTM(3, 3) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(1, 3) for _ in range(5)] # initialize the hidden state. 
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3)) for i in inputs: # Step through the sequence one element at a time. # after each step, hidden contains the hidden state. out, newHidden = lstm(i.view(1, 1, -1), hidden) ipuOut, ipuHidden = ipuLstm(i.view(1, 1, -1), hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) helpers.assert_allclose(expected=out, actual=ipuOut) hidden = newHidden def test_lstm2(): torch.manual_seed(42) numHidden = 5 inputSize = 3 lstm = nn.LSTM(3, numHidden) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(1, inputSize) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(len(inputs), 1, -1) hidden = (torch.randn(1, 1, numHidden), torch.randn(1, 1, numHidden)) out, newHidden = lstm(inputs, hidden) ipuOut, ipuHidden = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) helpers.assert_allclose(expected=out, actual=ipuOut) def test_lstm_twice(): torch.manual_seed(42) numHidden = 5 inputSize = 3 lstm = nn.LSTM(3, numHidden) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(1, inputSize) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(len(inputs), 1, -1) hidden = (torch.randn(1, 1, numHidden), torch.randn(1, 1, numHidden)) out, newHidden = lstm(inputs, hidden) ipuOut, ipuHidden = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) helpers.assert_allclose(expected=out, actual=ipuOut) out, newHidden = lstm(inputs, hidden) ipuOut2, ipuHidden2 = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden2[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden2[1]) helpers.assert_allclose(expected=out, actual=ipuOut2) 
helpers.assert_allclose(expected=ipuOut, actual=ipuOut2) def test_lstm_batch_first(): torch.manual_seed(42) numHidden = 5 inputSize = 3 lstm = nn.LSTM(3, numHidden, batch_first=True) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(1, inputSize) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(1, len(inputs), -1) hidden = (torch.randn(1, 1, numHidden), torch.randn(1, 1, numHidden)) out, newHidden = lstm(inputs, hidden) ipuOut, ipuHidden = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) helpers.assert_allclose(expected=out, actual=ipuOut) def test_lstm_batched(): torch.manual_seed(42) numHidden = 5 inputSize = 3 batch = 4 lstm = nn.LSTM(3, numHidden) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(batch, inputSize) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(len(inputs), batch, -1) print(inputs.shape) hidden = (torch.randn(1, batch, numHidden), torch.randn(1, batch, numHidden)) out, newHidden = lstm(inputs, hidden) ipuOut, ipuHidden = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) helpers.assert_allclose(expected=out, actual=ipuOut) def test_lstm_batched_batch_first(): torch.manual_seed(42) numHidden = 5 inputSize = 3 batch = 4 lstm = nn.LSTM(3, numHidden, batch_first=True) ipuLstm = poptorch.inferenceModel(lstm) inputs = [torch.randn(batch, inputSize) for _ in range(5)] # Add the extra 2nd dimension inputs = torch.cat(inputs).view(batch, len(inputs), -1) hidden = (torch.randn(1, batch, numHidden), torch.randn(1, batch, numHidden)) out, newHidden = lstm(inputs, hidden) ipuOut, ipuHidden = ipuLstm(inputs, hidden) helpers.assert_allclose(expected=newHidden[0], actual=ipuHidden[0]) helpers.assert_allclose(expected=newHidden[1], actual=ipuHidden[1]) 
helpers.assert_allclose(expected=out, actual=ipuOut) def test_lstm_fc(): torch.manual_seed(42) batch_size = 2 input_size = 5 op = nn.LSTM(input_size, hidden_size=3, num_layers=1, bias=True) input = torch.randn(1, batch_size, input_size) out_fn = lambda x: x[0] model = helpers.ModelWithWeights(op, input.shape, out_fn) poptorch_model = poptorch.trainingModel(model) (native_out, (native_hn, native_cn)), _ = model((input, )) (poptorch_out, (poptorch_hn, poptorch_cn)), _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out) helpers.assert_allclose(actual=poptorch_hn, expected=native_hn) helpers.assert_allclose(actual=poptorch_cn, expected=native_cn) # Training test - check weights have changed poptorch_model.assert_weights_changed() ================================================ FILE: tests/math_ops_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import unittest import torch import pytest import helpers import poptorch non_differentiable_ops = [ torch.ceil, torch.floor, torch.round, torch.sign, torch.trunc, torch.argmax, torch.argmin, torch.remainder, torch.floor_divide ] def op_harness(op, inputs, assert_func, test_training=False, out_fn=None): is_unary = len(inputs) == 1 if not is_unary: assert len(inputs) == 2 if test_training and not op in non_differentiable_ops: model = helpers.ModelWithWeights(op, inputs[0].shape, out_fn) # Run on CPU. native_out, _ = model(tuple(inputs)) # The LR should be large enough that a single training step will # definitely cause weights to change optim = torch.optim.AdamW(model.parameters(), lr=0.1) # Run on IPU. 
poptorch_model = poptorch.trainingModel(model, optimizer=optim) poptorch_out, _ = poptorch_model(tuple(inputs)) # Training test - check weights have changed poptorch_model.assert_weights_changed() else: class Model(torch.nn.Module): def __init__(self, op): super().__init__() self.op = op if is_unary: Model.forward = lambda self, x: self.op(x) else: Model.forward = lambda self, x, y: self.op(x, y) model = Model(op) # Run on CPU. native_out = model(*inputs) # Run on IPU. poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(*inputs) assert_func(native_out, poptorch_out) unary_ops_float = [ torch.abs, torch.acos, torch.acosh, torch.asin, torch.asinh, torch.atan, torch.atanh, # torch.angle, torch.ceil, torch.cos, torch.cosh, # torch.conj, torch.digamma torch.erf, torch.erfc, #torch.erfinv, torch.exp, torch.expm1, torch.floor, torch.frac, # torch.imag, torch.lgamma, torch.log, torch.log10, torch.log1p, torch.log2, # torch.logical_not, torch.mvlgamma, torch.neg, # torch.real, torch.reciprocal, torch.round, torch.rsqrt, torch.sigmoid, torch.sign, torch.sin, torch.sinh, torch.sqrt, torch.square, torch.tan, torch.tanh, torch.trunc, ] @pytest.mark.parametrize("op", unary_ops_float) def test_unary_ops_float(op): torch.manual_seed(42) input = torch.randn([1, 2, 10, 10]) def assert_(native_out, poptorch_out): helpers.assert_allclose(expected=native_out, actual=poptorch_out, atol=1e-03, rtol=1e-03, equal_nan=True) op_harness(op, [input], assert_, test_training=True) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("exponent", [4.0, 3, 2.5]) def test_binary_pow(inplace, exponent): torch.manual_seed(42) input = torch.randn([1, 2, 10, 200]) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, equal_nan=True) def op(x): if inplace: # Although inplace would work, the native and poptorch output will # naturally not match as the input is changed x = x + 0 return x.pow_(exponent) 
return torch.pow(x, exponent) op_harness(op, [input], assert_) unary_ops_int = [ torch.bitwise_not, ] @pytest.mark.parametrize("op", unary_ops_int) def test_unary_ops_int(op): torch.manual_seed(42) input = torch.randint(-1000, 1000, [1, 2, 10, 200]) def assert_(native_out, poptorch_out): helpers.assert_allequal(actual=poptorch_out, expected=native_out) op_harness(op, [input], assert_) unary_ops_bool = [ torch.bitwise_not, ] @pytest.mark.parametrize("op", unary_ops_bool) def test_unary_ops_bool(op): torch.manual_seed(42) input = torch.randint(2, [1, 2, 10, 200]) > 0 def assert_(native_out, poptorch_out): helpers.assert_allequal(actual=poptorch_out, expected=native_out) op_harness(op, [input], assert_) # Parameterize torch.clamp unit tests for different supported overloads clamp_inputs = [{ "min": 0.2, "max": 0.8 }, { "min": 0.2 }, { "max": 0.8 }, { "min": 0.8, "max": 0.2 }] @pytest.mark.parametrize("args", clamp_inputs) def test_clamp(args): torch.manual_seed(42) input = torch.randn([1, 2, 10, 10]) def op_clamp(x): return x.clamp(**args) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out) op_harness(op_clamp, [input], assert_, test_training=True) @pytest.mark.parametrize("args", clamp_inputs) def test_clamp_(args): torch.manual_seed(42) input = torch.randn([1, 2, 10, 10]) def op_clamp_(x): return x.clamp_(**args) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out) op_harness(op_clamp_, [input], assert_, test_training=True) @pytest.mark.parametrize("args", clamp_inputs) def test_clamp_mul_exp(args): torch.manual_seed(42) t = torch.randn([1, 2, 10, 10], dtype=torch.float16) class Model(torch.nn.Module): def forward(self, x): x = x.clamp(**args) x = torch.exp(0.5 * x) return x model = Model() ipu_model = poptorch.inferenceModel(model) actual_out = ipu_model(t) expected_out = model(t.to(torch.float32)) helpers.assert_allclose(actual=actual_out, 
expected=expected_out) @pytest.mark.parametrize( "op", [torch.clamp_min, torch.clamp_min_, torch.clamp_max, torch.clamp_max_]) def test_clamp_min_max(op): torch.manual_seed(42) magnitude = 1 input = torch.randn(1, 2, 10, 10) * magnitude def op_clamp(x): return op(x, magnitude * 0.75) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out) op_harness(op_clamp, [input], assert_, test_training=True) @pytest.mark.parametrize( "op", [torch.clamp_min, torch.clamp_min_, torch.clamp_max, torch.clamp_max_]) def test_clamp_min_max_tensor(op): torch.manual_seed(42) magnitude = 1 input = torch.randn(1, 2, 10, 10) * magnitude def op_clamp(x): return op(x, torch.tensor(magnitude * 0.75)) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out) op_harness(op_clamp, [input], assert_, test_training=True) clamp_int_inputs = [ { "min": -4.5, "max": 5.5 }, { "min": -4.5 }, { "max": 5.5 }, { "min": -5, "max": 5 }, { "min": -5 }, { "max": 5 }, ] @pytest.mark.parametrize("args", clamp_int_inputs) def test_clamp_int(args): torch.manual_seed(42) t = torch.randint(-100, 100, (100, )) class Model(torch.nn.Module): def forward(self, x): return torch.clamp(x, **args) model = Model() ipu_model = poptorch.inferenceModel(model) helpers.assert_allequal(actual=ipu_model(t), expected=model(t)) binary_ops_float = [ torch.add, torch.atan2, torch.div, torch.sub, torch.fmod, torch.floor_divide, torch.mul, torch.remainder, torch.true_divide ] @pytest.mark.parametrize("op", binary_ops_float) def test_binary_ops_float(op): torch.manual_seed(42) input1 = torch.randn([1, 2, 5, 1]) * 100.0 input2 = torch.randn([1, 2, 5, 1]) * 10.0 def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-05, rtol=1e-05, equal_nan=True) op_harness(op, [input1, input2], assert_, test_training=True) binary_ops_basic_element_wise_float = [ torch.add, torch.div, torch.sub, 
torch.mul, ] @pytest.mark.parametrize("op", binary_ops_basic_element_wise_float) def test_binary_ops_elementwise_edgecases(op): torch.manual_seed(42) input1 = torch.randn([1, 2, 10, 10]) input2 = torch.randn([1]) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-04, rtol=1e-04, equal_nan=True) class Model(torch.nn.Module): def __init__(self, op): super().__init__() self.op = op # Constant on LHS Model.forward = lambda self, x, _y: self.op(x, 4.0) op_harness(Model(op), [input1, input2], assert_, test_training=True) # Constant on RHS Model.forward = lambda self, x, _y: self.op(2.5, x) op_harness(Model(op), [input1, input2], assert_, test_training=True) # Constant on LHS wrong type. Model.forward = lambda self, x, _y: self.op(x, 4) op_harness(Model(op), [input1, input2], assert_, test_training=True) # Constant on RHS wrong type Model.forward = lambda self, x, _y: self.op(134, x) op_harness(Model(op), [input1, input2], assert_, test_training=True) binary_ops_basic_element_wise_bool = [ torch.add, torch.mul, ] @pytest.mark.parametrize("op", binary_ops_basic_element_wise_bool) def test_binary_ops_elementwise_bools(op): input1 = torch.tensor([False, True, False, True]) input2 = torch.tensor([False, False, True, True]) def assert_(native_out, poptorch_out): helpers.assert_allequal(actual=poptorch_out, expected=native_out) class Model(torch.nn.Module): def __init__(self, op): super().__init__() self.op = op # Both bools Model.forward = lambda self, x, y: self.op(x, y) op_harness(Model(op), [input1, input2], assert_) # Float on LHS Model.forward = lambda self, x, y: self.op(x.to(torch.float) + 1.0, y) op_harness(Model(op), [input1, input2], assert_) # Float on RHS Model.forward = lambda self, x, y: self.op(x, y.to(torch.float) + 1.0) op_harness(Model(op), [input1, input2], assert_) # Int on LHS Model.forward = lambda self, x, y: self.op(x.to(torch.int) + 1, y) op_harness(Model(op), [input1, input2], assert_) # 
Int on RHS Model.forward = lambda self, x, y: self.op(x, y.to(torch.int) + 1) op_harness(Model(op), [input1, input2], assert_) @pytest.mark.parametrize("op", [torch.fmod, torch.remainder]) def test_modulo_mixed_sign(op): input1 = torch.tensor([-4.3, 7.2, 5.0, 4.3, -7.2, 8.0]) input2 = torch.tensor([2.1, -3.4, 8.0, -2.1, 3.4, 5.0]) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-05, rtol=1e-05, equal_nan=True) op_harness(op, [input1, input2], assert_) def __and__(x, y): return x & y def __or__(x, y): return x | y def __xor__(x, y): return x ^ y binary_op_int = [ torch.bitwise_and, torch.bitwise_or, torch.bitwise_xor, __and__, __or__, __xor__ ] @pytest.mark.parametrize("op", binary_op_int) def test_binary_int_ops(op): input1 = torch.tensor([-4, 7, 5, 4, -7, 8], dtype=torch.int) input2 = torch.tensor([2, -3, 8, -2, 3, 5], dtype=torch.int) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-05, rtol=1e-05, equal_nan=True) op_harness(op, [input1, input2], assert_) # Poplar doesn't support binary ops on 8-bit integral types, but test we can # pass the rest of them. 
@pytest.mark.parametrize("dtype", [torch.int16, torch.int32, torch.int64]) def test_binary_int_op_types(dtype): input1 = torch.tensor([-4, 7, 5, 4, -7, 8], dtype=dtype) input2 = torch.tensor([2, -3, 8, -2, 3, 5], dtype=dtype) def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-05, rtol=1e-05, equal_nan=True) op_harness(torch.bitwise_and, [input1, input2], assert_) binary_op_bool = [ torch.bitwise_and, torch.bitwise_or, # torch.bitwise_xor, TODO(T43716) torch.logical_and, torch.logical_or, #torch.logical_xor TODO(T43716) ] @pytest.mark.parametrize("op", binary_op_bool) def test_binary_bool_ops(op): input1 = torch.tensor([-4, 7, 5, 4, -7, 8]) > 0 input2 = torch.tensor([2, -3, 8, -2, 3, 5]) > 0 def assert_(native_out, poptorch_out): helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-05, rtol=1e-05, equal_nan=True) op_harness(op, [input1, input2], assert_) # These functions support API 1 - op(input) reduction_ops_api1 = [ torch.max, torch.min, torch.amax, torch.amin, torch.argmax, torch.argmin, # torch.dist, torch.mean, torch.median, # torch.mode, torch.linalg.norm, torch.prod, #torch.std, torch.std_mean, torch.sum, #torch.unique, torch.unique_consecutive,torch.var, torch.var_mean, ] # These functions support API 2 - op(input,dim,keep_dim) reduction_ops_api2 = [ torch.max, torch.min, torch.amax, torch.amin, torch.argmax, torch.argmin, # torch.dist, torch.mean, torch.median, # torch.mode, torch.linalg.norm, torch.prod, torch.logsumexp, # logsumexp doesn't support API 1. 
@pytest.mark.parametrize("op", reduction_ops_api1)
def test_reduction_ops_float(op):
    """Run a single-argument reduction `op(input)` through `op_harness`.

    float32 results are compared with a tolerance; any other result dtype
    must match exactly.
    """
    torch.manual_seed(42)
    data = torch.randn([1, 2, 10, 10])

    def assert_(native_out, poptorch_out):
        # Give the PopTorch result the native shape before comparing.
        reshaped = poptorch_out.reshape(native_out.shape)

        if native_out.dtype != torch.float32:
            helpers.assert_allequal(actual=reshaped, expected=native_out)
            return

        helpers.assert_allclose(actual=reshaped,
                                expected=native_out,
                                atol=1e-05,
                                rtol=1e-05,
                                equal_nan=True)

    op_harness(op, [data], assert_, test_training=True)
@pytest.mark.parametrize("op", [torch.min, torch.max])
@pytest.mark.parametrize("dim", range(3))
@pytest.mark.parametrize("keepdim", [False, True])
def test_minmax_tuple_out(op, dim, keepdim):
    """Check min/max along a dimension yield matching tuples element-wise."""
    torch.manual_seed(42)
    data = torch.randn([1, 2, 10, 10])

    def operation(x):
        return op(x, dim=dim, keepdim=keepdim)

    def assert_(native_out, poptorch_out):
        # Both results must be tuples of the same length.
        assert isinstance(native_out, tuple)
        assert isinstance(poptorch_out, tuple)
        assert len(native_out) == len(poptorch_out)

        for native, actual in zip(native_out, poptorch_out):
            helpers.assert_allclose(actual=actual, expected=native)

    def take_values(result):
        # The harness is handed only the `values` field of the result.
        return result.values

    op_harness(operation, [data],
               assert_,
               test_training=True,
               out_fn=take_values)
@pytest.mark.parametrize("op", comparison_ops)
def test_compare_operations(op):
    """Run a comparison op tensor-vs-tensor and, except min/max, tensor-vs-scalar."""
    torch.manual_seed(42)

    lhs = torch.randn([1, 2, 10, 200])
    rhs = torch.randn([1, 2, 10, 200])
    positions = torch.randint(0, 200, [30])

    # Copy rhs into lhs at the drawn positions so a few elements are equal.
    lhs[0, 0, 0, positions] = rhs[0, 0, 0, positions]

    def assert_(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(op, [lhs, rhs], assert_)

    # min and max are not exercised against a constant right-hand side.
    if op in (torch.min, torch.max):
        return

    def constant_rhs(x):
        return op(x, 0.34)

    op_harness(constant_rhs, [lhs], assert_)
@pytest.mark.parametrize("shape", [(17, 4), (18, 23, 5)])
@pytest.mark.parametrize("descending", [True, False])
@unittest.mock.patch.dict("os.environ", helpers.disableSmallModel())
def test_sort(shape, descending):
    """Check `torch.sort` values (with tolerance) and indices (exactly)."""
    torch.manual_seed(42)
    data = torch.randn(shape)

    def operation(x):
        return torch.sort(x, descending=descending)

    def assert_(native_out, poptorch_out):
        sorted_values, sorted_indices = poptorch_out[0], poptorch_out[1]
        helpers.assert_allclose(actual=sorted_values,
                                expected=native_out.values)
        helpers.assert_allequal(actual=sorted_indices,
                                expected=native_out.indices)

    def take_values(result):
        # The harness is handed only the `values` field of the result.
        return result.values

    op_harness(operation, [data],
               assert_,
               test_training=True,
               out_fn=take_values)
@pytest.mark.parametrize("ty", types)
def test_constant_arrays(ty):
    """Subtract a tensor constant built inside the operation from the input."""
    torch.manual_seed(42)
    data = torch.randn([10]).to(ty)

    def operation(x):
        # The constant is created inside the operation rather than passed in.
        subtrahend = torch.tensor([1, -2, -3, 4, 5, 6, 7, -8, 9, -10],
                                  dtype=ty)
        return torch.sub(x, subtrahend)

    def assert_(native_out, poptorch_out):
        helpers.assert_allequal(actual=poptorch_out, expected=native_out)

    op_harness(operation, [data], assert_)
@pytest.mark.parametrize("shapes", broadcastable_shapes)
@pytest.mark.parametrize("scale", [0.35, 4.91, 12.0, -0.53, -3.45, -9.0, 0.0])
def test_addcdiv(shapes, scale):
    """Compare `torch.addcdiv` natively and via a PopTorch inference model.

    `shapes` holds one shape per operand; `scale` is passed as `value`.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def forward(self, tensor0, tensor1, tensor2):
            return torch.addcdiv(tensor0, tensor1, tensor2, value=scale)

    # One random operand per entry in `shapes`, drawn in order.
    operands = [torch.randn(shape) for shape in shapes]

    model = Model()
    expected = model(*operands)

    actual = poptorch.inferenceModel(model)(*operands)

    helpers.assert_allclose(actual=actual, expected=expected)
@pytest.mark.parametrize(
    "params",
    [
        # dims?, unbiased
        (False, ),
        ([0, 1, -1], True),
    ])
@pytest.mark.parametrize(
    "op", [torch.var, torch.var_mean, torch.std, torch.std_mean])
def test_var_std(op, params):
    """Run var/std (and their *_mean variants) with positional `params`."""
    torch.manual_seed(42)
    data = torch.randn(3, 4, 5)

    def model(x):
        # `params` is unpacked as the trailing positional arguments of `op`.
        return op(x, *params)

    def assert_(native_out, poptorch_out):
        helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    op_harness(model, [data], assert_)
def op_harness(op, inputs, inference_test_fn=None):
    """Run `op` natively and in a PopTorch training model, then check both.

    Args:
        op: Operation wrapped in `helpers.ModelWithWeights`.
        inputs: Sequence of input tensors; the first one's shape is given to
            the wrapper and the whole sequence is passed as one tuple.
        inference_test_fn: Optional callable `(native_out, poptorch_out)`
            that checks the outputs. Defaults to `helpers.assert_allclose`.
    """

    def default_check(native_out, poptorch_out):
        return helpers.assert_allclose(expected=native_out,
                                       actual=poptorch_out)

    check_outputs = (default_check
                     if inference_test_fn is None else inference_test_fn)

    model = helpers.ModelWithWeights(op, inputs[0].shape)
    packed_inputs = tuple(inputs)

    # Run on CPU.
    native_out, _ = model(packed_inputs)

    # Run on IPU, seeding it with torch's initial seed.
    opts = poptorch.Options()
    opts.randomSeed(torch.initial_seed())
    poptorch_model = poptorch.trainingModel(model, options=opts)
    poptorch_out, _ = poptorch_model(packed_inputs)

    # Inference test - check outputs
    check_outputs(native_out, poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()
@pytest.mark.parametrize("shape", [(2, 2, 14, 14)])
def test_upsample_bicubic(shape):
    """Compare bicubic `Upsample` with a scale factor of 0.4357 on CPU and IPU."""
    torch.manual_seed(42)
    upsample = torch.nn.Upsample(scale_factor=0.4357, mode='bicubic')
    data = torch.randn(shape)

    # Run on CPU.
    expected = upsample(data)

    # Run on IPU.
    actual = poptorch.inferenceModel(upsample)(data)

    helpers.assert_allclose(expected=expected, actual=actual)
@pytest.mark.parametrize("dropout_op", dropout_ops)
def test_dropout_inference(dropout_op):
    """Check a dropout layer in eval mode gives equal CPU and IPU outputs."""
    torch.manual_seed(42)

    # `eval()` returns the module itself, so construction and mode switch chain.
    layer = dropout_op().eval()
    data = torch.randn(128, 20)

    # Run on CPU.
    expected = layer(data)

    # Run on IPU.
    ipu_layer = poptorch.inferenceModel(layer)
    actual = ipu_layer(data)

    failure_msg = (
        f"{dropout_op.__name__} in inference session should equal identity.")
    helpers.assert_allequal(expected=expected, actual=actual, msg=failure_msg)
@pytest.mark.ipuHardwareRequired
def test_dropout_training():
    """Check the share of zeroed outputs is close to the dropout probability."""
    torch.manual_seed(42)
    drop_ratio = 0.8
    layer = torch.nn.Dropout(drop_ratio)

    # Input size needs to be large enough for convergence to expected dropout
    # ratio
    ones = torch.ones([100, 4, 3], dtype=torch.float)

    def check_ratio(_, poptorch_out):
        # The native output is ignored. The input is all ones, so summing it
        # where the output is zero counts the dropped elements.
        dropped = ones[poptorch_out == 0]
        observed_ratio = dropped.sum() / ones.numel()
        helpers.assert_allclose(actual=observed_ratio,
                                expected=drop_ratio,
                                rtol=0.01,
                                atol=0.01)

    op_harness(layer, [ones], check_ratio)
def test_embedding_padding_idx():
    """Compare output, loss and weight gradient of an Embedding with padding_idx=0.

    The native gradient comes from `loss.backward()`; the PopTorch one is
    read back through an anchored tensor.
    """
    torch.manual_seed(0)

    class TestEmbedding(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Re-seed so every instance is built from the same RNG state.
            torch.manual_seed(0)
            self.embedding = torch.nn.Embedding(10, 4, padding_idx=0)

        def forward(self, x):
            embedded = self.embedding(x)
            return embedded, poptorch.identity_loss(embedded.sum(), "none")

    native_model = TestEmbedding()
    # One index per embedding row.
    # pylint:disable=unsubscriptable-object
    num_rows = native_model.embedding.weight.shape[0]
    x = torch.arange(0, num_rows)

    y, loss = native_model(x)
    loss.backward()
    grad = native_model.embedding.weight.grad

    options = poptorch.Options()
    options.anchorTensor("grad_embedding", "Gradient___embedding.weight")
    pop_model = poptorch.trainingModel(TestEmbedding(), options=options)
    pop_y, pop_loss = pop_model(x)
    pop_grad = pop_model.getAnchoredTensor("grad_embedding")

    for actual, expected in ((pop_y, y), (pop_loss, loss), (pop_grad, grad)):
        helpers.assert_allclose(actual=actual, expected=expected)
@pytest.mark.parametrize("mode", ["max", "mean", "sum"])
def test_embedding_bag_include_last_offset(mode):
    """Compare `F.embedding_bag` with `include_last_offset=True` on CPU and IPU."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.nn.Parameter(torch.Tensor(10, 3))
            torch.nn.init.normal_(self.weight)

        def forward(self, x):
            # One offset every x.size(1) elements, with the total element
            # count appended as the final offset.
            bag_starts = torch.arange(0, x.numel(), x.size(1))
            bag_starts = torch.cat((bag_starts, torch.tensor([x.numel()])))
            flat_indices = x.reshape(-1)
            return F.embedding_bag(flat_indices,
                                   self.weight,
                                   offsets=bag_starts,
                                   include_last_offset=True,
                                   mode=mode)

    torch.manual_seed(0)
    model = Model()
    indices = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])

    expected = model(indices)
    actual = poptorch.inferenceModel(model)(indices)

    helpers.assert_allclose(actual=actual, expected=expected)
@pytest.mark.parametrize("params", [(2, 2, 1, 1, 1, 1), (3, 2, 1, 1, 1, 1),
                                    (2, 4, 1, 1, 1, 1), (2, 2, 2, 1, 1, 1),
                                    (2, 2, 1, 3, 1, 1), (2, 2, 1, 1, 3, 1),
                                    (2, 2, 1, 1, 1, 3)])
# Tests aten::col2im
def test_fold(params):
    """Run `torch.nn.Fold` through `op_harness` on an unfolded random image.

    `params` is (kernel_size_x, kernel_size_y, dilation_x, dilation_y,
    stride_x, stride_y).
    """
    (kernel_size_x, kernel_size_y, dilation_x, dilation_y, stride_x,
     stride_y) = params

    torch.manual_seed(42)
    image = torch.rand(2, 3, 11, 13)

    # Sliding-window settings shared by Unfold and Fold.
    window = dict(kernel_size=(kernel_size_y, kernel_size_x),
                  dilation=(dilation_y, dilation_x),
                  padding=(0, 0),
                  stride=(stride_y, stride_x))

    # unfold the input to provide an input to fold
    columns = torch.nn.Unfold(**window)(image)

    # Fold back to the spatial size of the original image.
    fold = torch.nn.Fold(output_size=image.shape[2:], **window)

    op_harness(fold, [columns])
5) # Wrap the linear layer with a weight_norm - This should # decompose "weight" into "weight_v" and "weight_g" self.lin = torch.nn.utils.weight_norm(lin, "weight", dim) def forward(self, x): x = self.lin(x) return x, poptorch.identity_loss(x**2, reduction="sum") model = Model() weight_v_before = model.lin.weight_v.detach().clone() weight_g_before = model.lin.weight_g.detach().clone() native_out, _ = model(x) poptorch_model = poptorch.trainingModel(model) poptorch_out, _ = poptorch_model(x) helpers.assert_allclose(expected=native_out, actual=poptorch_out) tensor_names = poptorch_model.getTensorNames() decomposed_tensors = ["weight_v", "weight_g"] # Check that both decomposed tensors exist in the graph assert all(f"lin.{t}" in tensor_names for t in decomposed_tensors) # Check that they also exist in the backward graph assert all(f"UpdatedVar___lin.{t}" in tensor_names for t in decomposed_tensors) # Ensure that the original weight tensor does NOT exist - # autograd should be performed with respect to the decomposed tensors # only assert "lin.weight" not in tensor_names assert "UpdatedVar___lin.weight" not in tensor_names n = 3 # Run a few more times to ensure that the decomposed weights are being # updated each time for i in range(n): weight_v_after = poptorch_model.lin.weight_v.detach().clone() weight_g_after = poptorch_model.lin.weight_g.detach().clone() # Ensure the decomposed weights changed since the previous iteration assert not torch.allclose(weight_v_before, weight_v_after) assert not torch.allclose(weight_g_before, weight_g_after) # Prepare for the next iteration if i != n - 1: weight_v_before = weight_v_after weight_g_before = weight_g_after poptorch_model(x) ================================================ FILE: tests/misc_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import re import pytest import torch import torch.nn as nn import helpers import poptorch @helpers.overridePoptorchLogLevel() def test_set_log_level(): for i in range(5): poptorch.setLogLevel(i) with pytest.raises(ValueError, match="Invalid log level integer"): poptorch.setLogLevel(5) poptorch.setLogLevel("TRACE") poptorch.setLogLevel("DEBUG") poptorch.setLogLevel("INFO") poptorch.setLogLevel("WARN") poptorch.setLogLevel("ERR") poptorch.setLogLevel("OFF") err_str = "Unknown log level: wibble. Valid values are DEBUG, ERR, INFO, " err_str += "OFF, TRACE and WARN" with pytest.raises(ValueError, match=err_str): poptorch.setLogLevel("wibble") @helpers.printCapfdOnExit @helpers.overridePopartLogLevel() def test_set_popart_log_level(capfd): # Only strings are allowed with pytest.raises(ValueError, match="Level must be one of"): poptorch._logging.setPopartLogLevel(0) # pylint: disable=protected-access # Only some strings are allowed with pytest.raises(ValueError, match="Level must be one of"): poptorch._logging.setPopartLogLevel("FOO") # pylint: disable=protected-access poptorch._logging.setPopartLogLevel("DEBUG") # pylint: disable=protected-access poptorch._logging.setPopartLogLevel("INFO") # pylint: disable=protected-access poptorch._logging.setPopartLogLevel("WARN") # pylint: disable=protected-access model = torch.nn.Linear(2, 2) inference_model = poptorch.inferenceModel(model) inference_model(torch.randn([2, 2])) log = helpers.LogChecker(capfd) log.assert_no_matches(r"popart:devicex \d+\.\d+ T:") log.assert_no_matches(r"popart:ir \d+\.\d+ D:") log.assert_no_matches(r"popart:ir \d+\.\d+ I:") log.assert_no_matches(r"popart:session \d+\.\d+ T:") log.assert_no_matches(r"popart:popart \d+\.\d+ T:") poptorch._logging.setPopartLogLevel("ERR") # pylint: disable=protected-access poptorch._logging.setPopartLogLevel("OFF") # pylint: disable=protected-access poptorch._logging.setPopartLogLevel("TRACE") # pylint: disable=protected-access inference_model = 
poptorch.inferenceModel(model) inference_model(torch.randn([2, 2])) log = helpers.LogChecker(capfd) log.assert_matches(r"popart:devicex \d+\.\d+ T:") log.assert_matches(r"popart:ir \d+\.\d+ D:") log.assert_matches(r"popart:ir \d+\.\d+ I:") log.assert_matches(r"popart:session \d+\.\d+ T:") log.assert_matches(r"popart:popart \d+\.\d+ T:") def test_zero_size_tensor_error(): class Model(torch.nn.Module): def forward(self, x): # The operation doesn't matter, we just want to produce the # failure on an operation that works with zero-sized tensors # in native Torch return torch.nn.functional.interpolate(x, size=(10, 10)) x = torch.randn(0, 2, 5, 5) poptorch_model = poptorch.inferenceModel(Model()) with pytest.raises( poptorch.Error, match= r"Zero-sized tensors are unsupported \(Got shape \[0, 2, 5, 5\]\)" ): poptorch_model(x) def test_torch_backward_error(): x = torch.Tensor([5.0]) model = helpers.ModelWithWeights(lambda x: x, x.shape) poptorch_model = poptorch.trainingModel(model) poptorch_out, poptorch_loss = poptorch_model((x, )) error_message = ( r"backward\(\) cannot be called explicitly on " r"outputs of a PopTorch model. If you're using a trainingModel, " r"the backwards pass is performed automatically when invoking the " r"model. 
If you're using an inferenceModel, you should use a " r"trainingModel instead.") with pytest.raises(poptorch.Error, match=error_message): poptorch_out.backward() with pytest.raises(poptorch.Error, match=error_message): poptorch_loss.backward() @pytest.mark.parametrize( "error_type", poptorch.poptorch_core.TestErrorType.__members__.values()) def test_generic_error_handling(error_type): with pytest.raises(poptorch.Error) as e: poptorch.poptorch_core._throwTestError(error_type) # pylint: disable=protected-access assert "throwTestError::bottomLevel" in e.value.args[0] assert "throwTestError::topLevel" in e.value.args[0] def test_specific_error_handling(): try: poptorch.poptorch_core._throwTestError( # pylint: disable=protected-access poptorch.poptorch_core.TestErrorType.PoplarRecoverableFullReset) assert False, "Expected an error to be thrown" except poptorch.RecoverableError as e: assert e.recovery_action == "FULL_RESET" assert "throwTestError::bottomLevel" in e.location assert "throwTestError::topLevel" in e.location assert e.type == "poplar_recoverable_runtime_error" # Message shouldn't contain any backtrace assert "throwTestError::bottomLevel" not in e.message assert "throwTestError::topLevel" not in e.message try: poptorch.poptorch_core._throwTestError( # pylint: disable=protected-access poptorch.poptorch_core.TestErrorType.PoplarLinkError) assert False, "Expected an error to be thrown" except poptorch.Error as e: # Make sure the backtrace was reset between the two exceptions assert e.location.count("throwTestError::bottomLevel") == 1 assert e.location.count("throwTestError::topLevel") == 1 assert e.type == "poplar_link_error" # Message shouldn't contain any backtrace assert "throwTestError::bottomLevel" not in e.message assert "throwTestError::topLevel" not in e.message # Make sure the link error is added at the end of the error message assert "-lfoo not found" in e.message try: poptorch.poptorch_core._throwTestError( # pylint: disable=protected-access 
poptorch.poptorch_core.TestErrorType.PoplarUnrecoverable) assert False, "Expected an error to be thrown" except poptorch.UnrecoverableError as e: # Make sure the backtrace was reset between the two exceptions assert e.location.count("throwTestError::bottomLevel") == 1 assert e.location.count("throwTestError::topLevel") == 1 assert e.type == "poplar_unrecoverable_runtime_error" # Message shouldn't contain any backtrace assert "throwTestError::bottomLevel" not in e.message assert "throwTestError::topLevel" not in e.message @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") @helpers.overridePopartLogLevel("DEBUG") def test_outline_attribute(capfd): class Model(torch.nn.Module): def __init__(self): super().__init__() self.gn1 = torch.nn.GroupNorm(4, 8) self.gn2 = torch.nn.GroupNorm(2, 8) def forward(self, x): with poptorch.Attribute(__outline={"layer": "embedding"}): x = self.gn1(x) return self.gn2(x) input = torch.randn(3, 8) poptorch_model = poptorch.inferenceModel(Model()) poptorch_model(input) testlog = helpers.LogChecker(capfd) get_regex = lambda op_name: (f'Op "{op_name}/.+", ' r"[0-9]+ of type ai\.graphcore\." 
".+:1" r"(?:\n.+)+" f"{op_name}" r".+(?:\n.+)+" "layer: layer:embedding") # Ensure the first group norm has the outline attribute testlog.assert_matches(get_regex("gn1"), per_line=False) # Ensure the second group norm doesn't have the attribute, # as it is outside the attribute scope testlog.assert_no_matches(get_regex("gn2"), per_line=False) it = testlog.createIterator() it.findNext("lowered to PopART") # Ensure none of the attributes key / values are actually lowered to PopART # (They should have been converted to attributes) it.assert_not_contains("Char") # Note: the ipu models are not supported by poptorch.ConnectionType.Never @pytest.mark.ipuHardwareRequired def test_compile_without_ipu(): class SimpleAdder(nn.Module): def forward(self, x, y): return x + y model = SimpleAdder() opts = poptorch.Options().connectionType(poptorch.ConnectionType.Never) inference_model = poptorch.inferenceModel(model, opts) t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) inference_model.compile(t1, t2) def test_error_on_cpu_tensor(): class Model(nn.Module): def forward(self, x): return torch.index_select(x, 0, torch.LongTensor([1, 0])) model = Model() inference_model = poptorch.inferenceModel(model) t1 = torch.rand(4) with pytest.raises(poptorch.Error, match=re.escape( "Expected an IPU tensor but got tensor(device=cpu, " "shape=[2], dtype=Long)")): inference_model.compile(t1) ================================================ FILE: tests/multiconv_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. from io import StringIO import json import pytest import torch from torch import nn import helpers import poptorch def getPopartMultiConvs(poptorch_model): ir_as_json = json.load(StringIO(poptorch_model._debugGetPopartIR())) # pylint: disable=protected-access assert "maingraph" in ir_as_json, "Expected maingraph in serialized IR." 
r = [] for op in ir_as_json["maingraph"]: if op["type"] == "MultiConv": r.append(op) return r def assert_contains_multiconv(poptorch_model, expected_num=1): num_multiconv = len(getPopartMultiConvs(poptorch_model)) msg = (f"Wrong number of MultiConv ops.\n" f" Expected : {expected_num}\n" f" Actual : {num_multiconv}.") assert num_multiconv == expected_num, msg @pytest.mark.parametrize("num_layers", [1, 2, 3]) def test_multiconv_basic(num_layers): class Model(torch.nn.Module): def __init__(self): super().__init__() self.convA = nn.Conv2d(1, 1, 5) self.convB = nn.Conv2d(1, 1, 5, bias=False) def forward(self, x): with poptorch.MultiConv(): a = self.convA(x) absx = torch.abs(x) b = self.convB(absx) return a + b m = [Model() for i in range(num_layers)] m = torch.nn.Sequential(*m) torch.manual_seed(0) input = torch.randn(2, 1, 28, 28) native = m(input) poptorch_model = poptorch.inferenceModel(m) poptorch_out = poptorch_model(input) assert_contains_multiconv(poptorch_model, num_layers) for cpu, pop in zip(native, poptorch_out): helpers.assert_allclose(expected=cpu, actual=pop) def multiconv_harness(multiconv): class Model(torch.nn.Module): def __init__(self): super().__init__() self.conv1 = nn.Conv2d(1, 10, 5) self.conv2 = nn.Conv2d(1, 10, 5) self.MultiConv = multiconv def forward(self, x): y = torch.pow(x, 2) with self.MultiConv: u = self.conv1(x) v = self.conv2(y) return u - v m = Model() torch.manual_seed(0) x = torch.randn(2, 1, 28, 28) native = m(x) poptorch_model = poptorch.inferenceModel(m) poptorch_out = poptorch_model(x) helpers.assert_allclose(expected=native, actual=poptorch_out) assert_contains_multiconv(poptorch_model) def test_multiconv_options_broadcast(): multiconv = ( poptorch.MultiConv().availableMemoryProportions(0.8).partialsTypes( torch.float).planType( poptorch.MultiConvPlanType.Parallel).perConvReservedTiles( 100).cycleBackOff(0.3)).enableConvDithering(True) multiconv_harness(multiconv) def test_multiconv_options_per_conv(): partials_types = 
[torch.float, torch.float] multiconv = (poptorch.MultiConv().availableMemoryProportions( (0.8, 0.7)).partialsTypes(partials_types).planType( poptorch.MultiConvPlanType.Parallel).perConvReservedTiles( 120).cycleBackOff(0.4)).enableConvDithering(True) multiconv_harness(multiconv) def test_multiconv_layers(): class Network(nn.Module): def __init__(self): super().__init__() self.layer1A = nn.Sequential(nn.Conv2d(1, 10, 5), nn.MaxPool2d(2), nn.ReLU()) self.layer1B = nn.Sequential(nn.Conv2d(1, 10, 5), nn.MaxPool2d(2), nn.ReLU()) self.layer2 = nn.Sequential(nn.Conv2d(10, 20, 5), nn.MaxPool2d(2), nn.ReLU()) self.layer3 = nn.Linear(320, 256) self.layer3_act = nn.ReLU() self.layer4 = nn.Linear(256, 10) self.softmax = nn.LogSoftmax(1) def forward(self, x): with poptorch.MultiConv(): absx = torch.abs(x) y = self.layer1A(absx) z = self.layer1B(x) x = y + z x = self.layer2(x) x = x.view(-1, 320) x = self.layer3_act(self.layer3(x)) x = self.layer4(x) x = self.softmax(x) return x model = Network() # Run on CPU. 
input = torch.randn(2, 1, 28, 28) native_out = model(input) poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) assert_contains_multiconv(poptorch_model) helpers.assert_allclose(actual=poptorch_out, expected=native_out) def test_invalid_multiconv_nested(): class Model(torch.nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(1, 10, 10) def forward(self, x): with poptorch.MultiConv(): with poptorch.MultiConv(): return self.conv(x) m = Model() poptorch_model = poptorch.inferenceModel(m) msg = "Nested poptorch.MultiConv is not supported" with pytest.raises(poptorch.Error, match=msg): poptorch_model(torch.zeros(2, 1, 32, 32)) def test_invalid_multiconv_empty(): class Model(torch.nn.Module): def forward(self, x): with poptorch.MultiConv(): return torch.pow(x, 2) m = Model() poptorch_model = poptorch.inferenceModel(m) msg = "Unexpected end_multi_conv" with pytest.raises(poptorch.Error, match=msg): poptorch_model(torch.ones(2, 2)) def test_invalid_multiconv_options(): mc = poptorch.MultiConv() with pytest.raises(ValueError, match="Invalid partials types"): mc.partialsTypes("half") with pytest.raises(AssertionError, match="Invalid plan type"): mc.planType("parallel") ================================================ FILE: tests/non_contiguous_tensors_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import torch import helpers import poptorch class FiveAdder(torch.nn.Module): def forward(self, in_1, in_2, in_3, in_4, in_5): return in_1 + in_2 + in_3 + in_4 + in_5 def test_non_contiguous(): torch.manual_seed(23148) model = FiveAdder() poptorch_model = poptorch.inferenceModel(model) OUTER_DIM = 1000 INNER_DIM = 40 nc1 = torch.randn([OUTER_DIM, INNER_DIM + 1])[:, 0:INNER_DIM] nc2 = torch.transpose(torch.randn([INNER_DIM, OUTER_DIM]), 0, 1) nc3 = torch.tensor([1.0]).expand([OUTER_DIM, INNER_DIM]) c1 = torch.randn([OUTER_DIM, INNER_DIM]) c2 = torch.randn([2, OUTER_DIM, INNER_DIM])[0, :, :] assert not nc1.is_contiguous() assert not nc2.is_contiguous() assert not nc3.is_contiguous() assert c1.is_contiguous() assert c2.is_contiguous() native_out = model(nc1, c1, nc2, c2, nc3) poptorch_out = poptorch_model(nc1, c1, nc2, c2, nc3) assert native_out.shape == (OUTER_DIM, INNER_DIM) print(native_out) print(poptorch_out) helpers.assert_allclose(actual=poptorch_out, expected=native_out) ================================================ FILE: tests/norms_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import os # pylint: disable=unused-import import unittest.mock from copy import deepcopy import torch import torch.optim as optim import torch.nn as nn import pytest import helpers import poptorch # Norms #'torch.nn.BatchNorm1d', 'torch.nn.BatchNorm2d', 'torch.nn.BatchNorm3d', 'torch.nn.GroupNorm', 'torch.nn.SyncBatchNorm', 'torch.nn.SyncBatchNorm.convert_sync_batchnorm', # 'torch.nn.InstanceNorm1d', 'torch.nn.InstanceNorm2d', 'torch.nn.InstanceNorm3d', 'torch.nn.LayerNorm', 'torch.nn.LocalResponseNorm', batch_norm_params = [ # Norm, affine, running_stats, training (nn.BatchNorm1d, False, False, False), (nn.BatchNorm2d, True, True, False), ] @pytest.mark.parametrize("batch_norm, affine, running_stats, training", batch_norm_params) @unittest.mock.patch.dict("os.environ", helpers.disableSmallModel()) def test_batchNorm(batch_norm, affine, running_stats, training): torch.manual_seed(42) C = 4 input_shape = [3, C, 5] if batch_norm in (nn.BatchNorm2d, nn.BatchNorm3d): input_shape.append(6) if batch_norm is nn.BatchNorm3d: input_shape.append(7) input = torch.randn(input_shape) norm = batch_norm(C, affine=affine, track_running_stats=running_stats) # pylint: disable=W0212 norm._buffers["running_mean"] = torch.randn([C]) norm._buffers["running_var"] = torch.clamp(torch.randn([C]) + 1.0, min=0.1) norm.train(training) model = helpers.ModelWithWeights(norm, input.shape) ipumodel = deepcopy(model) poptorch_model = poptorch.trainingModel( ipumodel) if training else poptorch.inferenceModel(ipumodel) # Run pytorch native on CPU. native_out, _ = model((input, )) # Run on IPU. 
poptorch_out, _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out) # Training test - check weights changed if training: poptorch_model.assert_weights_changed() def test_batchNorm_typing(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.bn = nn.BatchNorm1d(100) def forward(self, x, y): return self.bn(x) + y m = Model() ipu_model = poptorch.inferenceModel(m) x = torch.randn(20, 100, dtype=torch.half) y = torch.randn(20, 100, dtype=torch.half) ipu_model(x, y) def test_batchNorm_eval_during_training(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.bn = nn.BatchNorm1d(100) self.loss = torch.nn.MSELoss() def forward(self, x, target): y = self.bn(x) return y, self.loss(y, target) input = torch.randn([16, 100]) target = torch.randn([16, 100]) model = Model() for param in model.parameters(): param.requires_grad = False model.bn.eval() running_mean_init = model.bn.running_mean.clone().detach() running_var_init = model.bn.running_var.clone().detach() # Run pytorch native on CPU. native_out, _ = model(input, target) # Run on IPU. ipu_model = poptorch.trainingModel(model) poptorch_out, _ = ipu_model(input, target) # TODO: T38684 # Implicit copy only happens when we touch the params so copy explicitly. ipu_model.copyWeightsToHost() helpers.assert_allclose(actual=poptorch_out, expected=native_out) helpers.assert_allequal(actual=model.bn.running_mean, expected=running_mean_init) helpers.assert_allequal(actual=model.bn.running_var, expected=running_var_init) @pytest.mark.parametrize("norm_dim", range(4)) def test_layerNorm(norm_dim): torch.manual_seed(42) elementwise_affine = norm_dim % 2 == 1 input = torch.randn([3, 2, 5, 2]) layerNorm = nn.LayerNorm(input.shape[norm_dim:], elementwise_affine=elementwise_affine) model = helpers.ModelWithWeights(layerNorm, input.shape) # Run pytorch native on CPU. 
native_out, _ = model((input, )) poptorch_model = poptorch.trainingModel(model) # Run on IPU. poptorch_out, _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out, atol=1e-4, rtol=1e-4) # Training test - check weights changed poptorch_model.assert_weights_changed() def test_layerNormPretrainedWeights(): torch.manual_seed(42) class Model(nn.Module): def __init__(self): super().__init__() self.conv = nn.Conv2d(5, 5, kernel_size=(1, 1)) self.norm = nn.LayerNorm((5, 3, 10)) def forward(self, x): x = self.conv(x) return self.norm(x) model = Model() input = torch.randn([3, 5, 3, 10]) modelOut = model(input) # Run on IPU. ipuModel = poptorch.inferenceModel(model) poptorch_out = ipuModel(input) # Marginally more leeway. helpers.assert_allclose(actual=poptorch_out, expected=modelOut, rtol=1e-4, atol=1e-6) # We aren't training to any real target we just want to update the beta/gamma parameters and check they still work in popart. criterion = nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) model.train() for _ in range(0, 10): outputs = model(input) optimizer.zero_grad() loss = criterion(outputs, torch.ones([3, 5, 3, 10])) loss.backward() optimizer.step() model.eval() # Run on IPU with trained weights. ipuModel = poptorch.inferenceModel(model) poptorch_out = ipuModel(input) # Run on CPU again with trained weights. 
outputs = model(input) helpers.assert_allclose(actual=poptorch_out, expected=outputs, rtol=1e-4, atol=1e-6) @pytest.mark.parametrize("dims", {2, 3, 4, 5}) def test_groupNorm(dims): if dims == 2: # TODO(T49073): Match torch 1.10 GroupNorm implementation pytest.skip("Numerical differences between PyTorch and PopTorch") torch.manual_seed(42) affine = dims % 2 == 0 shape = [3, 10] if dims > 2: rand_shape = torch.randint(2, 5, [dims - 2]) shape.extend(rand_shape.tolist()) input = torch.randn(shape) groupNorm = nn.GroupNorm(5, 10, affine=affine) model = helpers.ModelWithWeights(groupNorm, input.shape) # Run pytorch native on CPU. native_out, _ = model((input, )) # Run on IPU. poptorch_model = poptorch.trainingModel(model) poptorch_out, _ = poptorch_model((input, )) # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out) # Training test - check weights changed poptorch_model.assert_weights_changed() def test_groupNorm_exfail(): torch.manual_seed(42) shape = [3, 10] input = torch.randn(shape) groupNorm = nn.GroupNorm(5, 10) # Run pytorch native on CPU. native_output = groupNorm(input) opts = poptorch.Options() opts._Popart.set("groupNormStridedChannelGrouping", True) # pylint: disable=protected-access # Run on IPU. ipuModel = poptorch.inferenceModel(groupNorm, opts) poptorch_out = ipuModel(input) # Group norm is pending correctness changes in popart/poplar so we will just test the shape/type for now. 
assert poptorch_out.size() == native_output.size() assert poptorch_out.type() == native_output.type() assert not torch.allclose(poptorch_out, native_output, atol=1e-1, rtol=0.1) def test_groupNorm_typing(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.gn = torch.nn.GroupNorm(4, 16) def forward(self, x): return self.gn(x) m = Model() ipu_model = poptorch.inferenceModel(m) x = torch.randn(20, 16, 50, dtype=torch.half) assert ipu_model(x).dtype == torch.half instance_norm_params = [ # norm, dims (nn.InstanceNorm1d, 1), (nn.InstanceNorm2d, 2), (nn.InstanceNorm3d, 3) ] @pytest.mark.parametrize("instance_norm, d", instance_norm_params) def test_instanceNorm(instance_norm, d): torch.manual_seed(42) affine = d % 2 == 1 class Model(nn.Module): def __init__(self): super().__init__() self.norm = instance_norm(6, affine=affine) self.fc1 = nn.Linear(6 * 2**d, 10) self.loss = nn.CrossEntropyLoss() def forward(self, x, target): out = self.norm(x) out = out.flatten(1) out = self.fc1(out) loss = self.loss(out, target) return out, loss for _ in range(3): model = Model() opt = optim.AdamW(model.parameters(), lr=0.01) poptorch_model = poptorch.trainingModel(model, optimizer=opt) shape = [5, 6] shape.extend([2 for _ in range(d)]) # Offset the data by multiplying by random values and shifting by a random bias input = torch.randint(2, 10, shape) * torch.randn( shape) + torch.randint(2, 10, [1]) * torch.randn(1) label = torch.randint(0, 10, [shape[0]]) _, original_loss = poptorch_model(input, label) for _ in range(0, 100): out, loss = poptorch_model(input, label) # Check we have trained the model assert loss < original_loss assert loss < 0.03 helpers.assert_allequal(actual=torch.argmax(out, dim=1), expected=label) def test_batchnorm_statistics(): torch.manual_seed(42) input_data = [torch.randn([4, 4, 3, 3]) for _ in range(10)] label = torch.ones(4).long() class Model(torch.nn.Module): def __init__(self): super().__init__() self.bn = 
torch.nn.BatchNorm2d(4) self.loss = torch.nn.CrossEntropyLoss() def forward(self, args, loss_inputs=None): output = self.bn(args) if loss_inputs is None: return output reduced = torch.mean(output, dim=(2, 3)) return output, self.loss(reduced, loss_inputs) model1 = Model() model1.train() optimizer = optim.SGD(model1.parameters(), lr=0.0) training_model = poptorch.trainingModel(model1, optimizer=optimizer) for data in input_data: training_model(data, label) model2 = Model() model2.train() for data in input_data: model2(data) # Shouldn't be needed but buffers alone don't trigger the copy. training_model.copyWeightsToHost() # Running mean is very close helpers.assert_allclose(actual=model2.bn.running_mean, expected=model1.bn.running_mean) # Running var is not so close. helpers.assert_allclose(actual=model2.bn.running_var, expected=model1.bn.running_var) @pytest.mark.parametrize('p', (1, 2, 1.0, 2.0, float('inf'), float('-inf'), 'fro')) def test_norm_in_loop(p): embedding = torch.nn.Parameter(torch.randn((200, 100))) num_loops = 3 class Model(torch.nn.Module): def __init__(self, ): super().__init__() self.embedding = embedding def forward(self): def loop_body(norm): norm += torch.norm(self.embedding[:100], p=p, dim=-1) return norm cumulative_norm = torch.zeros(100, device=self.embedding.device) (cumulative_norm, ) = poptorch.for_loop( num_loops, loop_body, [cumulative_norm], ) return cumulative_norm class RefModel(torch.nn.Module): def __init__(self, ): super().__init__() self.embedding = embedding def forward(self): cumulative_norm = torch.zeros(100, device=self.embedding.device) for _ in range(num_loops): cumulative_norm += torch.norm(self.embedding[:100], p=p, dim=-1) return cumulative_norm native = Model() ipu = poptorch.inferenceModel(native) ipu_out = ipu() native_out = RefModel()() helpers.assert_allclose(actual=ipu_out, expected=native_out) ================================================ FILE: tests/ops_test.py ================================================ 
#!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import re import torch import helpers import poptorch def test_print_tensor(): class Model(torch.nn.Module): def forward(self, x): return poptorch.ipu_print_tensor(x) m = poptorch.inferenceModel(Model()) m(torch.randn(5)) def test_print_tensor_with_title(): class Model(torch.nn.Module): def forward(self, x): return poptorch.ipu_print_tensor(x, "my_tensor") m = poptorch.inferenceModel(Model()) m(torch.randn(5)) def test_nop(): class Model(torch.nn.Module): def forward(self, x): return poptorch.nop(x) * 2 m = poptorch.inferenceModel(Model()) m(torch.randn(5)) def test_name_scope(): class Model(torch.nn.Module): def forward(self, x, y): with poptorch.NameScope("NameScope"): return x + y model = Model() poptorch_model = poptorch.inferenceModel(model) torch.manual_seed(42) x = torch.randn(10, 10) y = torch.randn(10, 10) poptorch_model(x, y) ir = poptorch_model._debugGetPopartIR() # pylint: disable=protected-access assert ir.find('"name":"NameScope/Add:InPlace"') != -1 @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_available_memory_last_op(capfd): class Model(torch.nn.Module): def forward(self, x): x = torch.matmul(x, x) return poptorch.set_available_memory(x, 0.3) input = torch.randn(10, 10) poptorch_model = poptorch.inferenceModel(Model()) poptorch_model.compile(input) # Check the trace log to make sure set_available_memory isn't pruned # before it's lowered to PopART ir_before_popart_regex = \ (r"Graph before lowering to PopART:\n" r".*\n" r".* popart::matmul.*\n" r".* poptorch::set_available_memory.*") log = helpers.LogChecker(capfd) log.assert_matches(ir_before_popart_regex, per_line=False) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_available_memory_linear(capfd): class LinModel(torch.nn.Module): def __init__(self): super().__init__() self.conv = torch.nn.Conv2d(3, 3, 3) self.lin = torch.nn.Linear(3, 3) def forward(self, 
x): x = self.conv(x) x = self.lin(x) x = poptorch.set_available_memory(x, 0.3) return x x = torch.rand(2, 3, 5, 5) model = LinModel() poptorch_model = poptorch.inferenceModel(model) poptorch_model(x) log = helpers.LogChecker(capfd) it = log.createIterator() # Assert that the set_available_memory node references the matmul, not the # add. it.findNext("Graph before lowering to PopART:") matmul_line = it.findNext("popart::matmul").strip() matmul_var = matmul_line.partition(" ")[0] sam_line = it.findNext("poptorch::set_available_memory").strip() actual_var = re.match(r".*set_available_memory[^\(]+\(([^\)]+).*", sam_line).group(1) assert actual_var == matmul_var ================================================ FILE: tests/optimizers_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import copy from io import StringIO import json import os import tempfile import unittest.mock import pytest import torch import torch.optim as optim from torch.optim.lr_scheduler import ExponentialLR import helpers import poptorch # Convenience classes for testing class LAMBNoBias(poptorch.optim.LAMB): def __init__(self, *args, **kwargs): super().__init__(*args, bias_correction=False, **kwargs) class AdamWNoBias(poptorch.optim.AdamW): def __init__(self, *args, **kwargs): super().__init__(*args, bias_correction=False, **kwargs) poptorch_optimizers = [ poptorch.optim.SGD, poptorch.optim.Adam, poptorch.optim.AdamW, poptorch.optim.RMSprop, poptorch.optim.LAMB, LAMBNoBias, AdamWNoBias ] supported_torch_optimizers = [ optim.SGD, optim.Adam, optim.AdamW, optim.RMSprop ] all_optimizers = poptorch_optimizers + supported_torch_optimizers def assert_is_ipu_optimizer_state(state, should_be_empty=False): assert isinstance(state, dict) assert "ipu_state" in state assert "ipu_param" in state if should_be_empty: assert state["ipu_state"] is None assert state["ipu_param"] is None else: assert isinstance(state["ipu_state"], 
dict) and all( isinstance(k, str) and isinstance(v, torch.Tensor) for k, v in state["ipu_state"].items()), state assert isinstance(state["ipu_param"], dict) and all( isinstance(k, str) and isinstance(v, torch.Tensor) for k, v in state["ipu_param"].items()), state assert len(state["ipu_param"]) > 0, "All optimizers have parameters" # Not all optimizers have a state though class OptimizerTestModel: def __init__(self, options=None, num_groups=1): layers = [torch.nn.Linear(10, 10) for _ in range(num_groups)] if num_groups == 1: base_model = layers[0] else: base_model = torch.nn.Sequential(*layers) self.input = torch.randn(1, 10) self.label = torch.randint(0, 10, [1]) self.options = options class Model(torch.nn.Module): def __init__(self): super().__init__() self.base_model = base_model self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target): out = self.base_model(data) loss = self.loss(out, target) return out, loss self.model = Model() self.poptorch_model = None def parameters(self): return self.model.parameters() def setOptimizer(self, optimizer): if self.poptorch_model is None: self.poptorch_model = poptorch.trainingModel(self.model, self.options, optimizer=optimizer) else: self.poptorch_model.setOptimizer(optimizer) def run(self): if self.poptorch_model is None: raise RuntimeError("Call setOptimizer first.") out_loss = self.poptorch_model(self.input, self.label) return out_loss @pytest.mark.parametrize("opt", helpers.onlyFirstIfReduced(all_optimizers)) def test_optimizer(opt): torch.manual_seed(42) model = OptimizerTestModel() # "Train" with learning rate of zero and check the loss remains the same. if opt == poptorch.optim.SGD: optimizer = opt(model.parameters(), lr=0.00, use_combined_accum=False) else: optimizer = opt(model.parameters(), lr=0.00) # Make sure the first run doesn't already pass the test. model.setOptimizer(optimizer) _, original_loss = model.run() # Loss shouldn't change. 
for _ in range(0, 50): out, loss = model.run() assert loss == original_loss # We shouldn't get the right result. assert not torch.argmax(out, dim=1) == model.label # Update the optimizer and check the loss now begins to decrease. optimizer.param_groups[0]['lr'] = 0.01 model.setOptimizer(optimizer) for _ in range(0, 1000): out, loss = model.run() # Check we have trained the "model" assert loss < original_loss assert loss < 0.03 assert torch.argmax(out, dim=1) == model.label @pytest.mark.parametrize( "opt", {optim.SGD, optim.AdamW, poptorch.optim.SGD, poptorch.optim.AdamW}) def test_sgd_IR(opt): torch.manual_seed(42) model = OptimizerTestModel() # "Train" with learning rate of zero and check the loss remains the same. if opt == poptorch.optim.SGD: optimizer = opt(model.parameters(), lr=0.01, use_combined_accum=False) else: optimizer = opt(model.parameters(), lr=0.01) model.setOptimizer(optimizer) model.run() as_json = json.load(StringIO(model.poptorch_model._debugGetPopartIR())) # pylint: disable=protected-access AdamVarUpdate = 0 AdamUpdater = 0 SGD0VarUpdate = 0 for name in as_json: assert name == "maingraph" for op in as_json[name]: if op['type'] == "AdamUpdater": AdamUpdater += 1 elif op['type'] == "AdamVarUpdate": AdamVarUpdate += 1 elif op['type'] == "SGD0VarUpdate": SGD0VarUpdate += 1 if opt in (optim.SGD, poptorch.optim.SGD): assert SGD0VarUpdate == 2 assert AdamVarUpdate == 0 and AdamUpdater == 0 else: assert SGD0VarUpdate == 0 assert AdamVarUpdate == 2 and AdamUpdater == 2 @helpers.printCapfdOnExit @pytest.mark.parametrize("opt", helpers.onlyFirstIfReduced( (poptorch.optim.Adam, poptorch.optim.AdamW, AdamWNoBias, poptorch.optim.LAMB, LAMBNoBias))) @pytest.mark.parametrize("accum_type", (torch.float16, torch.float)) @pytest.mark.parametrize("first_order_type", (torch.float16, torch.float)) @pytest.mark.parametrize("second_order_type", (torch.float16, torch.float)) @helpers.overridePoptorchLogLevel("DEBUG") def test_adam_accum_type(capfd, opt, accum_type, 
@helpers.printCapfdOnExit
@pytest.mark.parametrize("accum_type", (torch.float16, torch.float))
@pytest.mark.parametrize("velocity_accum_type", (torch.float16, torch.float))
@helpers.overridePoptorchLogLevel("DEBUG")
def test_sgd_accum_type(capfd, accum_type, velocity_accum_type):
    """Check the SGD accumulator types show up in the optimizer debug log."""

    def shortTypeName(dtype):
        # "torch.float16" -> "float16"
        full_name = str(dtype)
        assert full_name in ["torch.float32", "torch.float16"]
        return full_name.split(".")[1]

    torch.manual_seed(42)
    model = OptimizerTestModel()

    # Run a single step with the requested accumulator types.
    model.setOptimizer(
        poptorch.optim.SGD(model.parameters(),
                           lr=0.01,
                           use_combined_accum=False,
                           accum_type=accum_type,
                           velocity_accum_type=velocity_accum_type))
    model.run()

    expected = (
        "graph optimizer",
        "accumType=" + shortTypeName(accum_type),
        "firstOrderMomentumAccumType=" + shortTypeName(velocity_accum_type),
    )
    helpers.LogChecker(capfd).assert_matches(*expected)
@pytest.mark.parametrize(
    "opt",
    # An ordered tuple rather than a set: the iteration order of a set of
    # classes is not stable between processes, which makes the test
    # collection order non deterministic.
    # NOTE: optim.Adam, optim.AdamW, optim.RMSprop, poptorch.optim.Adam,
    # poptorch.optim.AdamW, AdamWNoBias, poptorch.optim.RMSprop,
    # poptorch.optim.LAMB and LAMBNoBias are currently not exercised here.
    (optim.SGD, poptorch.optim.SGD))
def test_optimizer_groups(opt):
    """Check per-group learning rates are honoured and can be updated.

    The model has two Linear layers, each in its own parameter group:
      * stage 0: both learning rates are zero, nothing must change.
      * stage 1: only the first group trains.
      * stage 2: both groups train and the model must learn the label.
    """
    torch.manual_seed(42)

    model = OptimizerTestModel(num_groups=2)

    # Parameter is a soft copy by default oddly.
    weight1 = model.model.base_model[0].weight.clone()
    bias1 = model.model.base_model[0].bias.clone()
    weight2 = model.model.base_model[1].weight.clone()
    bias2 = model.model.base_model[1].bias.clone()

    def get_optims(run_time):
        """Build the optimizer for stage ``run_time`` (0, 1 or 2)."""
        first_group_lr = 0.0 if run_time == 0 else 0.1
        second_group_lr = 0.1 if run_time == 2 else 0.0
        groups = [{
            'params': model.model.base_model[0].parameters(),
            "lr": first_group_lr
        }, {
            'params': model.model.base_model[1].parameters(),
            "lr": second_group_lr
        }]
        # use_combined_accum is only passed to the PopTorch optimizer.
        extra_args = {}
        if opt == poptorch.optim.SGD:
            extra_args["use_combined_accum"] = False
        return opt(groups, lr=0.1, **extra_args)

    # Start the optimizer as zero for both groups.
    model.setOptimizer(get_optims(run_time=0))

    _, original_loss = model.run()

    for _ in range(0, 10):
        _, loss = model.run()

    weight1_post, bias1_post = model.model.base_model[0].parameters()
    weight2_post, bias2_post = model.model.base_model[1].parameters()

    # Nothing should have changed.
    helpers.assert_allequal(expected=weight1, actual=weight1_post)
    helpers.assert_allequal(expected=weight2, actual=weight2_post)
    helpers.assert_allequal(expected=bias1, actual=bias1_post)
    helpers.assert_allequal(expected=bias2, actual=bias2_post)

    # Check we have not trained the model
    assert loss == original_loss

    # Now update the optimizer to train just the first group
    model.setOptimizer(get_optims(run_time=1))

    _, original_loss = model.run()

    for _ in range(0, 10):
        _, loss = model.run()

    weight1_post, bias1_post = model.model.base_model[0].parameters()
    weight2_post, bias2_post = model.model.base_model[1].parameters()

    assert loss != original_loss

    # Only the first layer is allowed to move.
    assert not torch.equal(weight1, weight1_post)
    helpers.assert_allequal(expected=weight2, actual=weight2_post)
    assert not torch.equal(bias1, bias1_post)
    helpers.assert_allequal(expected=bias2, actual=bias2_post)

    # Now update the optimizer to train both groups
    model.setOptimizer(get_optims(run_time=2))
    model.run()

    # Actually try and train here.
    for _ in range(0, 2000):
        out, _ = model.run()

    weight2_post, bias2_post = model.model.base_model[1].parameters()

    assert not torch.equal(weight2, weight2_post)
    assert not torch.equal(bias2, bias2_post)

    # Check we've trained the model.
    assert torch.argmax(out) == model.label
@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(poptorch_optimizers))
def test_optimizer_const(opt):
    """Change loss_scaling after the first run and run the model again."""
    torch.manual_seed(42)
    model = OptimizerTestModel()

    # Initialise the optimiser with the default loss_scaling value
    ctor_args = {"loss_scaling": 1.0, "lr": 1.0}
    if opt == poptorch.optim.SGD:
        ctor_args["use_combined_accum"] = False
    optimizer = opt(model.parameters(), **ctor_args)

    model.setOptimizer(optimizer)
    model.run()

    # Update the attribute on the live optimizer and upload it again.
    optimizer.loss_scaling = 2.0
    model.setOptimizer(optimizer)
    model.run()
@pytest.mark.parametrize("opt",
                         helpers.onlyFirstIfReduced(
                             [poptorch.optim.LAMB, LAMBNoBias]))
def test_lamb_max_weight_norm(opt):
    """Check LAMB trains with max_weight_norm both set and disabled."""

    def train_from_scratch(max_weight_norm):
        # Re-seed before every run so that both runs start from the same
        # weights, input and label, and so that each run is compared
        # against its own initial loss.
        torch.manual_seed(42)
        model = OptimizerTestModel()
        optimizer = opt(model.parameters(),
                        lr=0.01,
                        max_weight_norm=max_weight_norm)
        model.setOptimizer(optimizer)

        _, original_loss = model.run()
        for _ in range(0, 1000):
            out, loss = model.run()

        # Check we have trained the "model"
        assert loss < original_loss
        assert loss < 0.03
        assert torch.argmax(out, dim=1) == model.label

    train_from_scratch(max_weight_norm=100.0)

    # Run from scratch with max_weight_norm disabled: the model should
    # train like normal.
    train_from_scratch(max_weight_norm=None)
SGD", "learningRate=0.01,", "velocityScaling=2,") testlog.assert_contains("group 1 optimizer with SGD", "learningRate=0.01,", "velocityScaling=2,") # Make sure the loss_scaling can be changed, and individual velocityScaling can be set. o.loss_scaling = 4.0 o.param_groups[1]["velocity_scaling"] = 4.0 # onl for combined variant o.param_groups[0][ "loss_scaling"] = 4.0 # doesn't exist: loss scaling is not a group attribute model.setOptimizer(o) model.run() testlog = helpers.LogChecker(capfd) testlog.assert_contains("Ignoring unexpected group 0 attribute", "'loss_scaling'") if use_combined_accum: testlog.assert_contains("graph optimizer with SGD", "defaultLearningRate=0.01,", "defaultVelocityScaling=2,", "lossScaling=4") testlog.assert_contains("group 0 optimizer with SGD", "learningRate=0.01,", "velocityScaling=2,") else: testlog.assert_contains("Ignoring unexpected group 1 attribute", "'velocity_scaling'") testlog.assert_contains("group 0 optimizer with SGD", "learningRate=0.01,", "velocityScaling=4,") testlog.assert_contains("group 1 optimizer with SGD", "learningRate=0.01,", "velocityScaling=4,") # Make sure the the groups default to the new optimizer's default velocityScaling, manually set lr for both groups params = [{ "params": model.model.base_model[0].parameters() }, { "params": model.model.base_model[1].parameters() }] o = poptorch.optim.SGD( params, lr=0.01, loss_scaling=1.0, velocity_scaling=3.0 if use_combined_accum else None, use_combined_accum=use_combined_accum) o.lr = 0.5 # doesn't exit o.defaults["lr"] = 0.7 o.param_groups[0]["lr"] = 0.0 o.param_groups[1]["lr"] = 1.0 model.setOptimizer(o) model.run() testlog = helpers.LogChecker(capfd) testlog.assert_contains("Ignoring unexpected optimizer attribute", "'lr'") if use_combined_accum: testlog.assert_contains("graph optimizer with SGD", "defaultLearningRate=0.7,", "defaultVelocityScaling=3,", "lossScaling=1") testlog.assert_contains("group 0 optimizer with SGD", "learningRate=0,", "velocityScaling=3,") 
@helpers.printCapfdOnExit
@pytest.mark.parametrize(
    "opt",
    helpers.onlyFirstIfReduced((
        (poptorch.optim.SGD, (("momentum", 0.0), ("dampening", 0.0),
                              ("weight_decay", 0.0))),
        (poptorch.optim.Adam, (("betas", (0.9, 0.999)), ("eps", 1e-08),
                               ("weight_decay", 0.0), ("amsgrad", False))),
        (poptorch.optim.AdamW, (("betas", (0.9, 0.999)), ("eps", 1e-08),
                                ("weight_decay", 0.01), ("amsgrad", False))),
        (poptorch.optim.RMSprop, (("momentum", 0.0), ("alpha", 0.99),
                                  ("eps", 1e-08), ("weight_decay", 0.0))),
    )))
@helpers.overridePoptorchLogLevel("DEBUG")
# pylint: disable=too-many-statements
def test_variable_default(opt, capfd):
    """Check which optimizer attributes are logged as constant / variable.

    ``opt`` is a pair: the PopTorch optimizer class and a tuple of
    (attribute name, default PyTorch value) pairs for that optimizer.
    The debug log is parsed after each step to check whether the attributes
    are printed with or without the " (const)" suffix.
    """

    def toCamelCase(string):
        """Convert a snake case string (Pytorch) to camel case (Popart)"""
        words = string.split("_")
        return words[0] + "".join(w.capitalize() for w in words[1:])

    def toPopartName(name, default):
        """Return the list of Popart attribute names for a Pytorch name."""
        if name == "lr":
            name = "learning_rate"
        # amsgrad doesn't get passed to the backend
        if name in ["amsgrad"]:
            return []
        if name == "betas":
            return toPopartName("beta1", default) + toPopartName(
                "beta2", default)
        if default:
            name = "default_" + name
        return [toCamelCase(name)]

    def createExpr(attr, is_const=True):
        """Regexp matching "attr=value" followed (or not) by " (const)"."""
        const_expr = r" \(const\)"
        if not is_const:
            # Negative lookahead: the value must not be marked as const.
            const_expr = "(?!" + const_expr + ")"
        return r"%s=[^ ,]+%s" % (attr, const_expr)

    def genRegexp(attrs, default=False, is_const=False):
        """Generate the regexps for one or several Pytorch attributes."""
        if isinstance(attrs, str):
            attrs = [attrs]
        exprs = []
        for a in attrs:
            for n in toPopartName(a, default):
                exprs.append(createExpr(n, is_const))
        return exprs

    # All the attribute values in "opt" are the default pytorch values which
    # means if the user instantiate a pytorch optimizer with them, we'll
    # consider all these attributes as constant.
    # However if a poptorch optimizer is used then they will all be considered
    # as variable because they were explicitly passed to the constructor.
    poptorch_opt, opt_args_tuple = opt
    opt_args = dict(opt_args_tuple)
    pytorch_opt = poptorch_opt.__bases__[1]  # Retrieve the upstream type

    def makePoptorchOpt(params, **kwargs):
        """Create the poptorch optimizer with lr=1.0 and the given kwargs."""
        # SGD is always created with the separate accumulator variant.
        if poptorch_opt == poptorch.optim.SGD:
            kwargs["use_combined_accum"] = False
        return poptorch_opt(params, lr=1.0, **kwargs)

    def assertAllVariable(testlog):
        """Check every attribute and the learning rate are not const."""
        testlog.assert_matches("graph optimizer",
                               *genRegexp(attrs, default=True, is_const=False),
                               *genRegexp("lr", default=True, is_const=False))
        testlog.assert_matches("group 0 optimizer",
                               *genRegexp(attrs, is_const=False),
                               *genRegexp("lr", is_const=False))

    # Learning rate is a special case: it's always variable so handle it separately.
    attrs = list(opt_args)

    # Test the torch Optimizer: check all the attributes are set to constant by default
    model = OptimizerTestModel()
    optimizer = pytorch_opt(model.parameters(), lr=1.0, **opt_args)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    testlog.assert_matches("graph optimizer",
                           *genRegexp(attrs, default=True, is_const=True),
                           *genRegexp("lr", default=True, is_const=False))
    testlog.assert_matches("group 0 optimizer",
                           *genRegexp(attrs, is_const=True),
                           *genRegexp("lr", is_const=False))

    # Create a default pytorch optimizer (It should be identical to the previous one)
    optimizer = pytorch_opt(model.parameters(), lr=1.0)
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Create a default poptorch optimizer (As we don't explicitly specify any attribute they will all be considered as constant)
    optimizer = makePoptorchOpt(model.parameters())
    model.setOptimizer(optimizer)
    model.run()
    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Create a poptorch optimizer and set all the attributes manually: they should all be marked as variable
    # So let's now manually mark them as constant (This should result in the same optimizer as the default one)
    optimizer = makePoptorchOpt(model.parameters(), **opt_args)
    for attr in opt_args:
        assert not optimizer.variable_attrs.isConstant(attr)
        optimizer.variable_attrs.markAsConstant(attr)
    model.setOptimizer(optimizer)
    model.run()
    # Capture the output of this step: without a new LogChecker the checks
    # below would only look at the (already verified) previous capture.
    testlog = helpers.LogChecker(capfd)
    # As the optimizer is identical it shouldn't trigger any update in the backend
    testlog.assert_no_matches("graph optimizer")
    testlog.assert_no_matches("group 0 optimizer")

    # Test the poptorch Optimizer: check all the manually set attributes are set to variable by default
    # Create a new model as the optimizers would otherwise mismatch
    model = OptimizerTestModel()
    optimizer = makePoptorchOpt(model.parameters(), **opt_args)
    model.setOptimizer(optimizer)
    model.run()
    assertAllVariable(helpers.LogChecker(capfd))

    # Check the values can actually change
    new_opts = {}
    for k, v in opt_args.items():
        if isinstance(v, float):
            new_opts[k] = v + 0.5
        elif isinstance(v, tuple):
            new_opts[k] = tuple(elt / 2.0 for elt in v)
        else:
            new_opts[k] = v
    optimizer = makePoptorchOpt(model.parameters(), **new_opts)
    model.setOptimizer(optimizer)
    model.run()
    assertAllVariable(helpers.LogChecker(capfd))

    # Check we can manually mark attributes as variable
    optimizer = makePoptorchOpt(model.parameters())
    for attr in opt_args:
        assert optimizer.variable_attrs.isConstant(attr)
        optimizer.variable_attrs.markAsVariable(attr)
    model.setOptimizer(optimizer)
    model.run()
    assertAllVariable(helpers.LogChecker(capfd))
@pytest.mark.parametrize("reduction",
                         helpers.onlyFirstIfReduced(
                             (poptorch.ReductionType.Sum,
                              poptorch.ReductionType.Mean)))
def test_gradient_accum_new_api(reduction):
    """Train a three layer model using gradient accumulation."""
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(
                *[torch.nn.Linear(10, 10) for _ in range(3)])
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, target):
            fwd = self.model(x)
            return fwd, self.loss(fwd, target)

    accum = 20

    opts = poptorch.Options()
    opts.Training.gradientAccumulation(accum)
    opts.Training.accumulationAndReplicationReductionType(reduction)

    poptorch_model = poptorch.trainingModel(Model(), options=opts)

    # The same sample and label repeated for every accumulation step.
    ins = torch.randn([1, 10]).expand(accum, 10)
    target = torch.randint(0, 10, size=[1]).expand(accum)

    # One initial step followed by 500 more.
    for _ in range(501):
        _, loss = poptorch_model(ins, target)

    # Check we have trained the "model"
    assert loss < 0.03
@helpers.printCapfdOnExit
@pytest.mark.parametrize("use_combined_accum", (True, False))
@helpers.overridePoptorchLogLevel("WARN"
                                  )  # We only want warnings for this test
def test_extra_attributes2(capfd, use_combined_accum):
    """Extra attributes must not be reported once the checks are relaxed."""
    opts = poptorch.Options()
    opts.relaxOptimizerAttributesChecks()

    model = OptimizerTestModel(num_groups=2, options=opts)

    # Make sure all groups have the default values, and the values are not (const)
    params = [{
        "params": layer.parameters()
    } for layer in (model.model.base_model[0], model.model.base_model[1])]
    velocity_scaling = 2.0 if use_combined_accum else None
    o = poptorch.optim.SGD(params,
                           lr=0.01,
                           loss_scaling=2.0,
                           velocity_scaling=velocity_scaling,
                           use_combined_accum=use_combined_accum)
    model.setOptimizer(o)
    model.run()

    # Add attributes to the optimizer and to both groups after the first run.
    o.step = 0
    for group_index in (0, 1):
        o.param_groups[group_index]["initial_lr"] = 0.1
    model.setOptimizer(o)
    model.run()

    testlog = helpers.LogChecker(capfd)
    for pattern in ("unexpected optimizer attribute",
                    r"unexpected group \d attribute"):
        testlog.assert_no_matches(pattern)
@pytest.mark.parametrize("use_tf_variant", [True, False])
def test_rmsprop_tf_variant(use_tf_variant):
    """Compare a default RMSprop against one created with use_tf_variant.

    Both models start from the same weights, input and label: after a few
    steps the results must differ if the TF variant is enabled and be equal
    otherwise.
    """
    torch.manual_seed(0)

    # Make sure the TF flag is propagated correctly by comparing the
    # results of TF and non-TF versions.
    weight = torch.randn(10, 10)
    bias = torch.randn(10)
    data = torch.randn(1, 10)
    label = torch.randint(0, 10, [1])

    def buildModel(**rmsprop_args):
        # Every model gets its own copy of the shared initial state.
        test_model = OptimizerTestModel()
        test_model.model.base_model.weight = torch.nn.Parameter(
            weight.detach().clone())
        test_model.model.base_model.bias = torch.nn.Parameter(
            bias.detach().clone())
        test_model.input = data.detach().clone()
        test_model.label = label.detach().clone()
        test_model.setOptimizer(
            poptorch.optim.RMSprop(test_model.parameters(),
                                   lr=0.02,
                                   **rmsprop_args))
        return test_model

    model_pt = buildModel()
    model_tf = buildModel(use_tf_variant=use_tf_variant)

    # Both models must start from the same parameters.
    helpers.assert_allequal(actual=model_pt.model.base_model.weight.data,
                            expected=model_tf.model.base_model.weight.data)
    helpers.assert_allequal(actual=model_pt.model.base_model.bias.data,
                            expected=model_tf.model.base_model.bias.data)

    for _ in range(5):
        out_pt, loss_pt = model_pt.run()
        out_tf, loss_tf = model_tf.run()

    if not use_tf_variant:
        helpers.assert_allequal(
            actual=model_pt.model.base_model.weight.detach().clone(),
            expected=model_tf.model.base_model.weight.detach().clone())
        helpers.assert_allequal(actual=out_pt, expected=out_tf)
        helpers.assert_allequal(actual=loss_pt, expected=loss_tf)
    else:
        assert not torch.allclose(model_pt.model.base_model.weight.data,
                                  model_tf.model.base_model.weight.data)
        assert not torch.allclose(out_pt, out_tf)
        assert not torch.allclose(loss_pt, loss_tf)
@pytest.mark.parametrize("opt", [(optim.SGD, poptorch.optim.SGD),
                                 (optim.Adam, poptorch.optim.Adam),
                                 (optim.AdamW, poptorch.optim.AdamW)],
                         ids=['SGD', 'Adam', 'AdamW'])
def test_gradient_clipping(opt):
    """Compare PopTorch's max_grad_norm against torch's clip_grad_norm_.

    ``opt`` is a (torch optimizer class, poptorch optimizer class) pair.
    Both copies of the model are stepped on the same batch and their
    parameters and losses are compared.
    """
    torch.manual_seed(42)
    max_norm = 0.001

    class Stepper:
        """Hold one torch copy and one PopTorch copy of the same model."""

        def __init__(self, model, lr, optimizer):
            self.lr = lr
            self.original_model = model
            # optimizer[0] is the torch class, optimizer[1] the poptorch one.
            self.setup_torch(model, optimizer[0])
            self.setup_poptorch(model, optimizer[1])
            self.check_parameters()

        def setup_torch(self, model, optimizer):
            self.torch_model = copy.deepcopy(model)
            self.optimizer = optimizer(self.torch_model.parameters(),
                                       lr=self.lr)

        def setup_poptorch(self, model, optimizer):
            self.ipu_model = copy.deepcopy(model)
            # The clipping is done by the optimizer itself on this side.
            ipu_optimizer = optimizer(self.ipu_model.parameters(),
                                      lr=self.lr,
                                      max_grad_norm=max_norm)
            self.training_model = poptorch.trainingModel(
                self.ipu_model, optimizer=ipu_optimizer)

        def check_parameters(self):
            """Check both models have (nearly) identical parameters."""
            for expected, actual in zip(
                    self.torch_model.named_parameters(),
                    self.training_model.named_parameters()):
                # named_parameters() yields (name, tensor): keep the tensor.
                expected = expected[1]
                actual = actual[1]
                helpers.assert_allclose(actual=actual,
                                        expected=expected,
                                        atol=1e-5,
                                        rtol=1e-5)

        def torch_step(self, batch):
            """Run one training step with explicit gradient clipping."""
            self.optimizer.zero_grad()
            _, loss = self.torch_model(batch)
            loss = loss.sum()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.torch_model.parameters(),
                                           max_norm)
            self.optimizer.step()
            return loss

        def poptorch_step(self, batch):
            """Run one training step of the PopTorch model."""
            _, loss = self.training_model(batch)
            return loss

    num_samples = 10
    X = torch.randn(num_samples)
    lr = 0.01
    num_steps = 10

    torch_loss = torch.empty(num_steps)
    poptorch_loss = torch.empty(num_steps)

    stepper = Stepper(helpers.ModelWithWeights(torch.nn.LogSoftmax(), X.shape),
                      lr=lr,
                      optimizer=opt)

    # NOTE(review): check_parameters() is assumed to be called after every
    # step (inside the loop) - confirm against the upstream file.
    for i in range(num_steps):
        torch_loss[i] = stepper.torch_step((X, ))
        poptorch_loss[i] = stepper.poptorch_step((X, ))
        stepper.check_parameters()

    helpers.assert_allclose(expected=torch_loss,
                            actual=poptorch_loss,
                            atol=1e-5,
                            rtol=1e-5)


# TODO(T53152): remove this test.
def test_gradient_clipping_with_pipelining(): torch.manual_seed(0) opts = poptorch.Options() opts.Training.gradientAccumulation(3) class Model(torch.nn.Module): def __init__(self): super().__init__() self.w0 = poptorch.BeginBlock(torch.nn.Linear(3, 3), "w0", ipu_id=0) self.w1 = poptorch.BeginBlock(torch.nn.Linear(3, 3), "w1", ipu_id=1) self.loss = torch.nn.NLLLoss(reduction="mean") def forward(self, x, y): x = self.w0(x) x = self.w1(x) loss = self.loss(x, y) return x, loss model = Model() optimizer = poptorch.optim.SGD( model.parameters(), lr=0.01, max_grad_norm=0.001, ) poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer) poptorch_model(torch.randn((15, 3, 3)), torch.randint(0, 1, (15, 3))) @pytest.mark.parametrize("optim", poptorch_optimizers) def test_read_ipu_state(optim): torch.manual_seed(42) input = torch.randn(3) # A simple model with weights and a loss function model = helpers.ModelWithWeights(lambda x: x, input.shape) lr = 0.05 wd = 0.025 ls = 0.75 optimizer = optim(model.parameters(), lr=lr, weight_decay=wd, loss_scaling=ls) training_model = poptorch.trainingModel(model, optimizer=optimizer) # Before the model is compiled, the state_dict should be empty state = optimizer.state_dict() assert_is_ipu_optimizer_state(state, should_be_empty=True) # Compiling should populate the state_dict training_model.compile((input, )) s0 = optimizer.state_dict() assert_is_ipu_optimizer_state(s0, should_be_empty=False) sgd_param_keys = [ "scaledLearningRate0___specific___lin.bias", "scaledLearningRate0___specific___lin.weight", "weightDecayScaleFactor0___specific___lin.bias", "weightDecayScaleFactor0___specific___lin.weight" ] non_sgd_param_keys = [ "learningRate___specific___lin.bias", "learningRate___specific___lin.weight", "weightDecay___specific___lin.bias", "weightDecay___specific___lin.weight" ] # Check that shared keys are present and user provided values are read # back correctly if isinstance(optimizer, torch.optim.SGD): for k in 
sgd_param_keys: assert k in s0["ipu_param"].keys() # weightDecayScaleFactor0 = # 1 - lr * (1 - dm) * wd, dm = 0 wdsf0 = 1 - lr * wd helpers.assert_allclose( actual=s0["ipu_param"] ["weightDecayScaleFactor0___specific___lin.bias"], expected=torch.tensor(wdsf0)) # scaledLearningRate0 = # lr * (1 - dm) / ls, dm = 0 slr0 = lr / ls helpers.assert_allclose(actual=s0["ipu_param"] ["scaledLearningRate0___specific___lin.bias"], expected=torch.tensor(slr0)) else: # Only non-SGD optimisers have state tensors state_keys = ["Accl1___lin.weight", "Accl1___lin.bias"] for k in non_sgd_param_keys: assert k in s0["ipu_param"].keys() for k in state_keys: assert k in s0["ipu_state"].keys() helpers.assert_allclose( actual=s0["ipu_param"]["learningRate___specific___lin.bias"], expected=torch.tensor(lr)) helpers.assert_allclose( actual=s0["ipu_param"]["weightDecay___specific___lin.bias"], expected=torch.tensor(wd)) # Run the model, get the updated state dict and check optimiser state tensors have changed training_model((input, )) s1 = optimizer.state_dict() assert_is_ipu_optimizer_state(s1, should_be_empty=False) assert not all( torch.equal(s0["ipu_state"][k], s1["ipu_state"][k]) for k in s0["ipu_state"].keys()) helpers.assert_allclose(actual=s0["ipu_param"]["lossScaling_FLOAT"], expected=torch.tensor(ls)) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_read_ipu_state_cached(caplog, capfd): input = torch.ones(3) # A simple model with weights and a loss function model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = poptorch.optim.SGD(model.parameters(), lr=0.0) training_model = poptorch.trainingModel(model, optimizer=optimizer) training_model.compile((input, )) # Compilation should trigger an optimiser state IPU->host copy state = optimizer.state_dict() assert_is_ipu_optimizer_state(state, should_be_empty=False) log = helpers.LogChecker(capfd) log.assert_matches("Writing optimiser state tensors from IPU to host.") # The second invocation 
should use the cached state dict, since # the internal optimiser state hasn't changed state = optimizer.state_dict() assert_is_ipu_optimizer_state(state, should_be_empty=False) assert "Using cached optimiser state dict" in caplog.text @unittest.mock.patch.dict("os.environ", helpers.disableAllModels()) def test_read_ipu_state_offline(): input = torch.ones(3) # A simple model with weights and a loss function model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = poptorch.optim.SGD(model.parameters(), lr=0.0) opts = poptorch.Options() opts.useOfflineIpuTarget() training_model = poptorch.trainingModel(model, opts, optimizer=optimizer) training_model.compile((input, )) state = optimizer.state_dict() assert_is_ipu_optimizer_state(state, should_be_empty=True) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") @pytest.mark.parametrize("optim", [poptorch.optim.SGD, torch.optim.SGD]) def test_read_ipu_state_on_detach(caplog, capfd, optim): input = torch.ones(3) # A simple model with weights and a loss function model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = optim(model.parameters(), lr=0.0) training_model = poptorch.trainingModel(model, optimizer=optimizer) training_model.compile((input, )) training_model.detachFromDevice() # Detach should trigger an optimiser state IPU->host copy for PopTorch optimizers log = helpers.LogChecker(capfd) if isinstance(optimizer, poptorch.optim.Optimizer): log.assert_matches("Writing optimiser state tensors from IPU to host.") else: log.assert_no_matches( "Writing optimiser state tensors from IPU to host.") # The second invocation should use the cached state dict, since # the internal optimiser state hasn't changed state = optimizer.state_dict() log = helpers.LogChecker(caplog.text) if isinstance(optimizer, poptorch.optim.Optimizer): log.assert_matches("Using cached optimiser state dict") else: log.assert_no_matches("Using cached optimiser state dict") optimizer.load_state_dict(state) 
training_model.attachToDevice() # Detach should trigger an optimiser state IPU->host copy for PopTorch optimizers log = helpers.LogChecker(capfd) if isinstance(optimizer, poptorch.optim.Optimizer): log.assert_matches( "Writing optimiser state tensors from host to IPU memory") else: log.assert_no_matches( "Writing optimiser state tensors from host to IPU memory") @pytest.mark.parametrize("optim", poptorch_optimizers) @pytest.mark.parametrize("incomplete_state", [True, False]) def test_write_ipu_state(optim, incomplete_state): torch.manual_seed(42) input = torch.randn(3) # A simple model with weights and a loss function model = helpers.ModelWithWeights(lambda x: x, input.shape) # SGD requires LR to be specified but the value doesn't matter optimizer = optim(model.parameters(), lr=0.0) # Hacky way to make sure all the attributes are set to variable. optimizer.variable_attrs._variable_attributes = copy.deepcopy( # pylint: disable=protected-access optimizer.variable_attrs._allowed_attributes) # pylint: disable=protected-access training_model = poptorch.trainingModel(model, optimizer=optimizer) # Compiling should populate the state_dict training_model.compile((input, )) # The initial optimiser state s0 = optimizer.state_dict() deleted_param = None deleted_state = None if incomplete_state: # delete the first param deleted_param = next(iter(s0["ipu_param"].items())) del s0["ipu_param"][deleted_param[0]] deleted_state = next(iter(s0["ipu_state"].items())) del s0["ipu_state"][deleted_state[0]] # Just set values randomly so we can check they changed for k, v in s0["ipu_param"].items(): s0["ipu_param"][k] = torch.randn_like(v) for k, v in s0["ipu_state"].items(): s0["ipu_state"][k] = torch.randn_like(v) # Load the modified state dict optimizer.load_state_dict(s0) # Read it back into a new dict s1 = optimizer.state_dict() # Check that the values read back match the ones set for k, v in s0["ipu_param"].items(): helpers.assert_allequal(actual=s1["ipu_param"][k], expected=v) if 
deleted_param: # At that point we haven't used the new optimizer yet so the deleted keys haven't been restored. assert deleted_param[0] not in s1["ipu_param"] for k, v in s0["ipu_state"].items(): helpers.assert_allequal(actual=s1["ipu_state"][k], expected=v) if deleted_state: # At that point we haven't used the new optimizer yet so the deleted keys haven't been restored. assert deleted_state[0] not in s1["ipu_state"] # Use the model and check the two states have been merged. training_model((input, )) s1 = optimizer.state_dict() # Check that the values read back match the ones set for k, v in s0["ipu_param"].items(): helpers.assert_allequal(actual=s1["ipu_param"][k], expected=v) if deleted_param: helpers.assert_allequal(actual=s1["ipu_param"][deleted_param[0]], expected=deleted_param[1]) # Using the model will have changed the state, so we can only check the values have changed. for k, v in s0["ipu_state"].items(): assert not torch.allclose(v, s1["ipu_state"][k]) if deleted_state: assert not torch.allclose(deleted_state[1], s1["ipu_state"][deleted_state[0]]) def test_write_ipu_state_from_cpu(): input = torch.ones(2) lin = torch.nn.Linear(2, 1) optimizer = torch.optim.Adam(lin.parameters()) # Perform a CPU training step to populate torch optimiser state out = lin(input) out.backward() optimizer.step() pop_optimizer = poptorch.optim.Adam(lin.parameters()) # Try to load the CPU optimiser state onto the IPU with pytest.raises( RuntimeError, match="Only IPU optimizer states can be loaded onto the IPU."): pop_optimizer.load_state_dict(optimizer.state_dict()) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_write_ipu_state_before_override(capfd): input = torch.ones(2) model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = poptorch.optim.Adam(model.parameters()) training_model = poptorch.trainingModel(model, optimizer=optimizer) # Compile and run the model, get the state dict training_model((input, )) s1 = optimizer.state_dict() 
# destroy model so it can be rewrapped training_model.destroy() # Create a new optimiser and load the state dict new_optimizer = poptorch.optim.Adam(model.parameters()) new_optimizer.load_state_dict(s1) # Compile a new model with the new loaded optimiser new_training_model = poptorch.trainingModel(model, optimizer=new_optimizer) new_training_model.compile((input, )) # The state read back should match the initial state s2 = new_optimizer.state_dict() for k, v in s1["ipu_state"].items(): helpers.assert_allequal(actual=s2["ipu_state"][k], expected=v) # Confirm that an optimiser state IPU->host copy actually took place log = helpers.LogChecker(capfd) log.assert_matches("Writing optimiser state tensors from IPU to host.") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_LR_scheduler(capfd): input = torch.ones(2) model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = poptorch.optim.Adam(model.parameters(), lr=1.0) # Halve the LR after each training step scheduler = ExponentialLR(optimizer, 0.5) training_model = poptorch.trainingModel(model, optimizer=optimizer) # Compile and run the model training_model((input, )) log = helpers.LogChecker(capfd) # Initial optimizer upload log.assert_matches("Updating group 0 optimizer") # Step the scheduler for the next epoch # lr: 1.0 -> 0.5 scheduler.step() # Set the new LR training_model.setOptimizer(optimizer) log = helpers.LogChecker(capfd) log.assert_matches("Updating group 0 optimizer") # Updating the optimizer's parameter shouldn't trigger a sync of the weights. log.assert_no_matches("copyWeightsToHost()") # Run the model to use the new optimizer. training_model((input, )) s0 = optimizer.state_dict() log = helpers.LogChecker(capfd) log.assert_matches("Writing optimiser state tensors from IPU to host") # Run the model to make the IPU state dirty. training_model((input, )) log = helpers.LogChecker(capfd) # No data transfer should happen. 
log.assert_no_matches("Writing") optimizer.load_state_dict(s0) # Run the model to trigger the transfers. training_model((input, )) log = helpers.LogChecker(capfd).createIterator() # Updating the optimizer's state should trigger a backup of the IPU weights first. log.findNext("copyWeightsToHost()") # Then the new state should be uploaded log.findNext("Writing optimiser state tensors from host to IPU memory") def test_write_ipu_state_from_checkpoint(): input = torch.ones(2) model = helpers.ModelWithWeights(lambda x: x, input.shape) optimizer = poptorch.optim.Adam(model.parameters(), lr=1.0) # Halve the LR after each training step scheduler = ExponentialLR(optimizer, 0.5) training_model = poptorch.trainingModel(model, optimizer=optimizer) # Compile and run the model training_model((input, )) # Step the scheduler for the next epoch # lr: 1.0 -> 0.5 scheduler.step() # Set the new LR training_model.setOptimizer(optimizer) s1 = optimizer.state_dict() with tempfile.TemporaryDirectory() as d: path = os.path.join(d, "checkpoint.pt") # Save the state_dict to file torch.save({"optimizer_state_dict": s1}, path) # Load it back checkpoint = torch.load(path) # Create a new optimizer and load the checkpoint optimizer = poptorch.optim.Adam(model.parameters(), lr=0.1) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) s2 = optimizer.state_dict() # Ensure the new optimizer state matches the one saved for k, v in s1["ipu_state"].items(): helpers.assert_allequal(actual=s2["ipu_state"][k], expected=v) for k, v in s1["ipu_param"].items(): helpers.assert_allequal(actual=s2["ipu_param"][k], expected=v) # Now continue training to test that the updated LR is used scheduler = ExponentialLR(optimizer, 0.5) training_model.setOptimizer(optimizer) # New LR is set internally when the model is run training_model((input, )) s3 = optimizer.state_dict() torch_lr = torch.tensor(s3["param_groups"][0]["lr"]) poptorch_lr = s3["ipu_param"]['learningRate___specific___lin.bias'] # Ensure the torch 
LR parameter is correct helpers.assert_allclose(actual=torch_lr, expected=torch.tensor(0.5)) # Ensure the internal LR parameter matches helpers.assert_allclose(actual=poptorch_lr, expected=torch_lr) def test_setOptimizer_frozen_options_ok(): input = torch.ones(2) opts = poptorch.Options() opts.Training.setMeanAccumulationAndReplicationReductionStrategy( poptorch.MeanReductionStrategy.Post) model = helpers.ModelWithWeights(lambda x: x, input.shape) # This will freeze the options data = poptorch.DataLoader(opts, [(torch.ones(2), )]) optimizer = poptorch.optim.Adam(model.parameters(), lr=0.5, accum_type=torch.half) # will set the reduction strategy to Running training_model = poptorch.trainingModel(model, optimizer=optimizer, options=opts) training_model.compile(tuple(next(iter(data)))) assert training_model.options.Training.meanAccumulationAndReplicationReductionStrategy == poptorch.MeanReductionStrategy.Running # pylint: disable=line-too-long optimizer.param_groups[0]['lr'] = 0.01 training_model.setOptimizer(optimizer) def test_setOptimizer_frozen_options_broken(): input = torch.ones(2) opts = poptorch.Options() opts.Training.setMeanAccumulationAndReplicationReductionStrategy( poptorch.MeanReductionStrategy.Post) model = helpers.ModelWithWeights(lambda x: x, input.shape) # This will freeze the options data = poptorch.DataLoader(opts, [(torch.ones(2), )]) optimizer = poptorch.optim.Adam(model.parameters(), lr=0.5) # will set the reduction strategy to Running training_model = poptorch.trainingModel(model, optimizer=optimizer, options=opts) training_model.compile(tuple(next(iter(data)))) assert training_model.options.Training.meanAccumulationAndReplicationReductionStrategy == poptorch.MeanReductionStrategy.Post # pylint: disable=line-too-long optimizer.param_groups[0]['lr'] = 0.01 optimizer.accum_type = torch.half with pytest.raises(ValueError, match="is already compiled"): training_model.setOptimizer(optimizer) @helpers.printCapfdOnExit @pytest.mark.parametrize("opt", 
all_optimizers) @pytest.mark.parametrize("subclassed", [True, False]) def test_optimizer_warnings(capfd, opt, subclassed): # The NoBias classes defined in this file are subclasses and # should be warned about, so skip the 'not subclassed' case for these if 'NoBias' in str(opt) and not subclassed: pytest.skip() if subclassed: class SubclassedOpt(opt): pass opt = SubclassedOpt model = OptimizerTestModel() model.setOptimizer(opt(model.parameters(), lr=0.01)) expected_warning = "Poptorch does not run Python optimizer code directly" testlog = helpers.LogChecker(capfd) if subclassed: testlog.assert_contains(expected_warning) else: testlog.assert_not_contains(expected_warning) ================================================ FILE: tests/options_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import copy import unittest.mock import tempfile import os import threading import torch import torch.nn as nn import pytest import helpers import poptorch from poptorch.enums import OutputMode, MeanReductionStrategy def test_set_options(): # pylint: disable=protected-access # Create our model. opts = poptorch.Options() opts.outputMode(poptorch.enums.OutputMode.All) # Just set a bunch of options and check they're successfully parsed. with tempfile.TemporaryDirectory() as tmp: opts.deviceIterations(1).setExecutionStrategy( poptorch.PipelinedExecution()).replicationFactor(1).logDir( tmp).enableSyntheticData(True).maxRepeatLogs(None) poptorch.poptorch_core._validateOptions(opts.toDict()) class TestSetOptionsFromEnvironment: """Checks that we can set options through environment variables""" def test_block(self): ref = poptorch.Options() opts = poptorch.Options() # Just set a bunch of options and check they're successfully parsed. 
with tempfile.TemporaryDirectory() as tmp: opts.deviceIterations(2).replicationFactor(1).logDir( tmp).enableSyntheticData(True) try: os.environ["POPTORCH_DEFAULT_OPTIONS"] = ( '{"deviceIterations":2,' f'"replicationFactor":1,"logDir":"{tmp}",' '"enableSyntheticData":true}') init_set = poptorch.Options() finally: del os.environ["POPTORCH_DEFAULT_OPTIONS"] assert f"{ref}" != f"{opts}" assert f"{opts}" == f"{init_set}" def test_dotted_access(self): ref = poptorch.Options() opts = poptorch.Options() opts.Precision.enableFloatingPointExceptions(True) try: os.environ["POPTORCH_DEFAULT_OPTIONS"] = ( '{"Precision.enableFloatingPointExceptions":true}') init_set = poptorch.Options() finally: del os.environ["POPTORCH_DEFAULT_OPTIONS"] assert f"{ref}" != f"{init_set}" assert f"{opts}" == f"{init_set}" def test_enum_conversion(self): ref = poptorch.Options() opts = poptorch.Options() opts.connectionType(poptorch.ConnectionType.OnDemand) try: os.environ["POPTORCH_DEFAULT_OPTIONS"] = ( '{"connectionType":"ConnectionType.OnDemand"}') init_set = poptorch.Options() finally: del os.environ["POPTORCH_DEFAULT_OPTIONS"] assert f"{ref}" != f"{init_set}" assert f"{opts}" == f"{init_set}" def test_setting_popart_options(self): ref = poptorch.Options() opts = poptorch.Options() # pylint: disable=protected-access opts._Popart.set("saveInitializersToFile", "my_file.onnx") try: os.environ["POPTORCH_DEFAULT_OPTIONS"] = ( '{"_Popart.set":["saveInitializersToFile", "my_file.onnx"]}') init_set = poptorch.Options() finally: del os.environ["POPTORCH_DEFAULT_OPTIONS"] assert f"{ref}" != f"{init_set}" assert f"{opts}" == f"{init_set}" @pytest.mark.parametrize("key, value, expected_str", [ ("asdfasdf", True, r"Unknown .* option .*"), ("dotChecks", torch.empty(1, 1), r"Unknown value type .* for option .*"), ("asdfasdf", torch.empty( 1, 1), r"(Unknown .* option .*|Unknown value type .* for option .*)"), ]) def test_invalid_options(key, value, expected_str): # pylint: disable=protected-access opts = 
poptorch.Options() opts.outputMode(poptorch.enums.OutputMode.All) opts._Popart.set(key, value) with pytest.raises(poptorch.Error, match=expected_str): poptorch.poptorch_core._validateOptions(opts.toDict()) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_set_options_from_file(capfd): class LogChecker(helpers.LogChecker): def validate(self): # pylint: disable=line-too-long self.assert_contains( "poptorch.Options set replication_factor to value 1") self.assert_contains( "poptorch.Options set device_iterations to value 1") self.assert_contains( "poptorch.Options set execution_mode to value 1") self.assert_contains( "poptorch.Options set syntheticDataMode to value 2") class Network(nn.Module): def forward(self, x, y): return x + y options_list = [ "deviceIterations(1)", "setExecutionStrategy(poptorch.ShardedExecution())", " replicationFactor(1)", # Whitespace should be stripped " ", # Empty lines should be skipped "enableSyntheticData(True) # Inline comments should be ignored", "# Comments should be ignored" ] options_list = "\n".join(options_list) with tempfile.TemporaryDirectory() as tmp: filepath = os.path.join(tmp, "tmp.conf") f = open(filepath, "w") # Write the options to file f.write(options_list) f.close() opts = poptorch.Options() # Read the options back opts.loadFromFile(filepath) # Ensure that a useful error message is output on malformed input f = open(filepath, "a") f.write("\noutputMode(poptorch.OutputMode.All") f.close() with pytest.raises(poptorch.options.ConfigFileError) as e: opts.loadFromFile(filepath) assert "SyntaxError at line 5 of tmp.conf: unexpected EOF " \ "while parsing\n" \ "> options.outputMode(poptorch.OutputMode.All" in str(e.value) # Create the model model = Network() inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) # Run the model inference_model(x, y) testlog = LogChecker(capfd) # Ensure the options were actually set testlog.validate() @helpers.printCapfdOnExit 
@helpers.overridePoptorchLogLevel("DEBUG") def test_override_options_from_file(capfd): class LogChecker(helpers.LogChecker): def validate(self): # pylint: disable=line-too-long self.assert_contains( "poptorch.Options set replication_factor to value 2") self.assert_contains( "poptorch.Options set device_iterations to value 1") self.assert_contains( "poptorch.Options set execution_mode to value 1") self.assert_contains( "poptorch.Options set syntheticDataMode to value 2") class Network(nn.Module): def forward(self, x, y): return x + y options_list = [ "deviceIterations(2)", "setExecutionStrategy(poptorch.ShardedExecution())", "replicationFactor(2)", "enableSyntheticData(True)", ] options_list_override = [ "deviceIterations(1)", ] options_list = "\n".join(options_list) options_list_override = "\n".join(options_list_override) with tempfile.TemporaryDirectory() as tmp: filepath = os.path.join(tmp, "tmp.conf") f = open(filepath, "w") # Write the options to file f.write(options_list) f.close() filepath_override = os.path.join(tmp, "tmp_override.conf") f = open(filepath_override, "w") # Write the options to file f.write(options_list_override) f.close() opts = poptorch.Options() # Read the options back opts.loadFromFile(filepath) # Read the override options opts.loadFromFile(filepath_override) # Create the model model = Network() inference_model = poptorch.inferenceModel(model, opts) assert inference_model.options.replication_factor == 2 assert inference_model.options.device_iterations == 1 x = torch.ones(2) y = torch.zeros(2) # Run the model inference_model(x, y) testlog = LogChecker(capfd) # Ensure the options were set correctly # The override should ONLY override options that were actually set testlog.validate() @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_set_popart_options(capfd): # pylint: disable=protected-access opts = poptorch.Options() opts.outputMode(poptorch.enums.OutputMode.All) opts._Popart.set("hardwareInstrumentations", 
set([0, 1])) opts._Popart.set("dotChecks", ["FINAL", "ALL"]) opts._Popart.set("engineOptions", { "debug.allowOutOfMemory": "true", }) opts._Popart.set("reportOptions", {"reportOptA": "A", "reportOptB": "B"}) opts._Popart.set("convolutionOptions", {"convOptA": "A", "convOptB": "B"}) opts._Popart.set("matmulOptions", {"matOptA": "A", "matOptB": "B"}) opts._Popart.set("lstmOptions", {"lstmOptA": "A", "lstmOptB": "B"}) opts._Popart.set("gclOptions", {"gclOptA": "A", "gclOptB": "B"}) opts._Popart.set("customCodelets", []) opts._Popart.set("autoRecomputation", 1) opts._Popart.set("enableOutlining", True) opts._Popart.set("batchSerializationSettings.factor", 1) opts._Popart.set("batchSerializationSettings.concatOnVirtualGraphChange", True) opts._Popart.set("batchSerializationSettings.concatOnExecutionPhaseChange", True) opts._Popart.set("batchSerializationSettings.concatOnPipelineStageChange", True) opts._Popart.set("batchSerializationSettings.transformContext", 0) opts._Popart.set("batchSerializationSettings.method", 0) opts._Popart.set("batchSerializationSettings.batchSchedule", 1) opts._Popart.set("accumulateOuterFragmentSettings.schedule", 1) opts._Popart.set("accumulateOuterFragmentSettings.excludedVirtualGraphs", ["0", "1"]) opts._Popart.set("enableExplicitIR", True) opts._Popart.set( "automaticLossScalingSettings.gradientTensorTrackingMethod", 1) opts._Popart.set("updatableNamedBuffers", ["t1", "t2"]) poptorch.poptorch_core._validateOptions(opts.toDict()) log = helpers.LogChecker(capfd) log.assert_contains("poptorch.Options added 0 to hardwareInstrumentations") log.assert_contains("poptorch.Options added 1 to hardwareInstrumentations") log.assert_contains("poptorch.Options added FINAL to dotChecks") log.assert_contains("poptorch.Options added ALL to dotChecks") log.assert_contains( "poptorch.Options set engineOptions[debug.allowOutOfMemory] to true") log.assert_contains("poptorch.Options set reportOptions[reportOptA] to A") log.assert_contains("poptorch.Options set 
reportOptions[reportOptB] to B") log.assert_contains( "poptorch.Options set convolutionOptions[convOptA] to A") log.assert_contains( "poptorch.Options set convolutionOptions[convOptB] to B") log.assert_contains("poptorch.Options set matmulOptions[matOptA] to A") log.assert_contains("poptorch.Options set matmulOptions[matOptB] to B") log.assert_contains("poptorch.Options set lstmOptions[lstmOptA] to A") log.assert_contains("poptorch.Options set lstmOptions[lstmOptB] to B") log.assert_contains("poptorch.Options set gclOptions[gclOptA] to A") log.assert_contains("poptorch.Options set gclOptions[gclOptB] to B") log.assert_contains("poptorch.Options set autoRecomputation to value 1") log.assert_contains("poptorch.Options set enableOutlining to value true") log.assert_contains( "poptorch.Options set batchSerializationSettings.factor to value 1") log.assert_contains( "poptorch.Options set " "batchSerializationSettings.concatOnVirtualGraphChange to value true") log.assert_contains( "poptorch.Options set " "batchSerializationSettings.concatOnExecutionPhaseChange to value true" ) log.assert_contains( "poptorch.Options set " "batchSerializationSettings.concatOnPipelineStageChange to value true") log.assert_contains( "poptorch.Options set " "batchSerializationSettings.transformContext to value 0") log.assert_contains( "poptorch.Options set batchSerializationSettings.method to value 0") log.assert_contains( "poptorch.Options set batchSerializationSettings.batchSchedule " "to value 1") log.assert_contains( "poptorch.Options set accumulateOuterFragmentSettings.schedule " "to value 1") log.assert_contains( "poptorch.Options added 0 to " "accumulateOuterFragmentSettings.excludedVirtualGraphs") log.assert_contains( "poptorch.Options added 1 to " "accumulateOuterFragmentSettings.excludedVirtualGraphs") log.assert_contains("poptorch.Options set enableExplicitIR to value true") log.assert_contains( "poptorch.Options set " "automaticLossScalingSettings.gradientTensorTrackingMethod to 
value 1") log.assert_contains("poptorch.Options added t1 to updatableNamedBuffers") log.assert_contains("poptorch.Options added t2 to updatableNamedBuffers") def test_popart_patterns(): # pylint: disable=protected-access # Create our model. opts = poptorch.Options() opts.outputMode(poptorch.enums.OutputMode.All) patterns = {"PadSum": True} opts._Popart.setPatterns(patterns, 0) poptorch.poptorch_core._validateOptions(opts.toDict()) @helpers.printCapfdOnExit @pytest.mark.parametrize("dtype", [torch.half, torch.float]) @pytest.mark.parametrize("ptype", [torch.half, torch.float]) @helpers.overridePoptorchLogLevel("TRACE") def test_popart_partials(capfd, dtype, ptype): # pylint: disable=protected-access torch.manual_seed(42) x = torch.randn((1, 16, 16), dtype=dtype) model = torch.nn.Sequential() model.add_module('lin', torch.nn.Linear(16, 16, dtype=dtype)) model.add_module('conv', torch.nn.Conv1d(16, 16, 1)) opts = poptorch.Options() opts.Precision.setPartialsType(ptype) poptorch_model = poptorch.inferenceModel(model, opts) poptorch_model(x) log = helpers.LogChecker(capfd) if ptype == torch.float: log.assert_contains( 'poptorch.Options set partialsTypeMatMuls to value float') log.assert_contains( 'poptorch.Options set convolutionOptions[partialsType] to float') log.assert_contains('"partialsType":"MatMulPartialsType::FLOAT"') log.assert_contains('"partialsType[0]":"float"') else: log.assert_contains( 'poptorch.Options set partialsTypeMatMuls to value half') log.assert_contains( 'poptorch.Options set convolutionOptions[partialsType] to half') log.assert_contains('"partialsType":"MatMulPartialsType::HALF"') log.assert_contains('"partialsType[0]":"half"') @pytest.mark.parametrize("optim", [ poptorch.optim.SGD, poptorch.optim.Adam, poptorch.optim.AdamW, poptorch.optim.RMSprop, poptorch.optim.LAMB, ]) @pytest.mark.parametrize("initial_ls", [1.0, 2.0]) def test_automatic_loss_scaling(optim, initial_ls): input = torch.ones(5) # Just a simple model with weights and a loss 
function model = helpers.ModelWithWeights(lambda x: x, input.shape) # Weights need to be in fp16, since fp32 gradients don't influence # the loss scaling factor model.half() opts = poptorch.Options() opts.Training.setAutomaticLossScaling(True) # Anchor the final loss scale to compare against the update factor in ipu_state opts.anchorTensor("ls_final", "finalLossScale", poptorch.OutputMode.Final) # The lr value doesn't matter here, we just want to check the loss scale is updated optimizer_args = { "params": model.parameters(), "lr": 0.0, "loss_scaling": initial_ls } if optim == poptorch.optim.SGD: optimizer_args["use_combined_accum"] = False optimizer = optim(**optimizer_args) training_model = poptorch.trainingModel(model, opts, optimizer) # Compile the model first, so that we can get the ipu_state before running the model training_model.compile((input, )) for _ in range(5): # Get the update factor before running the model. This is the value used to # compute ls_final ls_update_factor = optimizer.state_dict( )['ipu_state']['lossScaleUpdateFactor'] training_model((input, )) ls_final = training_model.getAnchoredTensor("ls_final") # ls_final = ls_update_factor * initial_ls helpers.assert_allclose(actual=initial_ls * ls_update_factor, expected=ls_final) @pytest.mark.ipuHardwareRequired def test_real_ipu_selection(): class Network(nn.Module): def forward(self, x, y): return x + y model = Network() # Force-disable the IPU model opts = poptorch.Options().useIpuModel(False) inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) inference_model(x, y) @pytest.mark.ipuHardwareRequired def test_ipu_id_selection(): class Network(nn.Module): def forward(self, x, y): return x + y model = Network() # Force-disable the IPU model opts = poptorch.Options().useIpuId(0) inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) inference_model(x, y) @unittest.mock.patch.dict("os.environ", helpers.disableAllModels()) 
def test_offline_ipu(): class Network(nn.Module): def forward(self, x, y): return x + y model = Network() # Force-disable the IPU model opts = poptorch.Options().useOfflineIpuTarget() inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) with pytest.raises(AssertionError, match="Trying to run a model on an offline device"): inference_model(x, y) @unittest.mock.patch.dict("os.environ", {}) def test_export_proto_file(): class Network(nn.Module): def forward(self, x, y): return x + y with tempfile.TemporaryDirectory() as tmp: file = os.path.join(tmp, "my_dir", "my_model.proto") os.environ["POPTORCH_EXPORT_PROTO_FILE"] = file model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) inference_model(x, y) assert os.path.isfile(file) def test_tensor_location(): class Network(nn.Module): def forward(self, x, y): return x + y model = Network() opts = poptorch.Options() opts.TensorLocations.setActivationLocation( poptorch.TensorLocationSettings().minElementsForOffChip( 4).useOnChipStorage(True)) opts.TensorLocations.setWeightLocation( poptorch.TensorLocationSettings().useIOTilesToStore( True).useReplicatedTensorSharding(False)) opts.TensorLocations.setOptimizerLocation( poptorch.TensorLocationSettings().useIOTilesToLoad( False).useReplicatedTensorSharding( True).minElementsForReplicatedTensorSharding(4)) opts.TensorLocations.setAccumulatorLocation( poptorch.TensorLocationSettings().useOnChipStorage(False)) inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) inference_model(x, y) @helpers.printCapfdOnExit @pytest.mark.parametrize("dtype", [torch.half, torch.float]) @helpers.overridePoptorchLogLevel("TRACE") def test_running_statistics(capfd, dtype): x = torch.randn((16, 16), dtype=dtype) model = torch.nn.Sequential() model.add_module('lin', torch.nn.Linear(16, 16)) model.add_module('bn', torch.nn.BatchNorm1d(16)) if dtype == torch.half: model.half() 
poptorch_model = poptorch.inferenceModel(model) poptorch_model(x) log = helpers.LogChecker(capfd) dtype_str = "Float" if dtype == torch.float else "Half" device = "ipu:0" log.assert_contains( f" : {dtype_str}(16, strides=[1], requires_grad=0, device={device}) " "-> bn.running_var") def test_copying_options(): # pylint: disable=protected-access opts = poptorch.Options() locationOnChip = poptorch.TensorLocationSettings() locationOnChip.useOnChipStorage(True) locationOutsideChip = poptorch.TensorLocationSettings() locationOutsideChip.useOnChipStorage(False) opts.deviceIterations(5) opts.Distributed.configureProcessId(5, 15) opts.anchorTensor("t1", "tensor1", OutputMode.EveryN, 2) opts._Popart.set("autoRecomputation", 3) opts._Popart.set("dummyKey", 5) opts.Training.gradientAccumulation(4) opts.TensorLocations.setWeightLocation(locationOnChip) deep_copy = copy.deepcopy(opts) opts.deviceIterations(4) opts.Distributed.configureProcessId(2, 15) opts.anchorTensor("t2", "tensor2", OutputMode.Final) opts._Popart.set("autoRecomputation", 2) opts.TensorLocations.setWeightLocation(locationOutsideChip) assert opts.device_iterations != deep_copy.device_iterations assert opts.anchored_tensors != deep_copy.anchored_tensors assert opts.replication_factor == deep_copy.replication_factor assert opts.log_dir == deep_copy.log_dir assert opts.auto_round_num_ipus == deep_copy.auto_round_num_ipus assert opts.output_mode == deep_copy.output_mode assert opts.output_return_period == deep_copy.output_return_period assert opts.connection_type == deep_copy.connection_type assert opts.sync_pattern == deep_copy.sync_pattern assert (opts.available_memory_proportion == deep_copy.available_memory_proportion) assert (opts.Distributed.distributed_process_id != deep_copy.Distributed.distributed_process_id) assert (opts.Distributed.num_distributed_processes == deep_copy.Distributed.num_distributed_processes) assert deep_copy.TensorLocations.location_weight["onChip"] assert not 
opts.TensorLocations.location_weight["onChip"] assert (opts._Popart.options["autoRecomputation"] != deep_copy._Popart.options["autoRecomputation"]) assert (opts._Popart.options["dummyKey"] == deep_copy._Popart.options["dummyKey"]) assert (opts.Training.gradient_accumulation == deep_copy.Training.gradient_accumulation) def test_preserving_options_intact(): class ExampleModel(torch.nn.Module): def __init__(self): super().__init__() self.bias = torch.nn.Parameter(torch.zeros(())) def forward(self, x): return torch.cat([ 100 * torch.nn.LeakyReLU()(-x + self.bias), 100 * torch.nn.LeakyReLU()(x - self.bias) ], dim=-1) class ExampleModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.model = ExampleModel() def forward(self, input, target): out = self.model(input) return (torch.nn.functional.softmax(out), torch.nn.CrossEntropyLoss(reduction="mean")(out, target)) model = ExampleModelWithLoss() opts = poptorch.Options() training = poptorch.trainingModel(model, opts) inference = poptorch.inferenceModel(model, opts) assert opts.defaultOutputMode() assert training.options.output_mode == OutputMode.Final assert inference.options.output_mode == OutputMode.All @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") @pytest.mark.parametrize("namescopes_enabled", [True, False]) def test_name_scope_hook_disabled(capfd, namescopes_enabled): class Network(torch.nn.Module): def __init__(self): super().__init__() self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1, 4, 5), torch.nn.MaxPool2d(2), torch.nn.ReLU()) self.layer2 = torch.nn.Sequential(torch.nn.Linear(40, 10), torch.nn.ReLU()) self.softmax = torch.nn.LogSoftmax(1) def forward(self, x): x = self.layer1(x) x = x.view(5, 40) x = self.layer2(x) x = self.softmax(x) return x model = Network() options = poptorch.Options() if not namescopes_enabled: options.disableModuleNamescope() poptorch_model = poptorch.inferenceModel(model, options) input = torch.randn(2, 1, 15, 15) _ = poptorch_model(input) ir = 
poptorch_model._debugGetPopartIR() # pylint: disable=protected-access expected_namescopes = [ 'layer1/0/', 'layer1/1/', 'layer1/1/', 'layer2/0/', 'layer2/1/', 'softmax' ] base_names = ['Conv', 'MaxPool', 'Relu', 'MatMul', 'Relu', 'LogSoftmax'] assert len(expected_namescopes) == len(base_names) for i, name in enumerate(base_names): namescope = expected_namescopes[i] if namescopes_enabled else '' expected_output = f'"name":"{namescope}{name}' assert ir.find(expected_output) testlog = helpers.LogChecker(capfd) it = testlog.createIterator() it.findNext("lowered to PopART") # Ensure none of the scope names are actually lowered to PopART # They should have been handled by the compiler and not be part # of the graph anymore. it.assert_not_contains("Char") def test_ipu_context_flag(): class Network(nn.Module): def forward(self, x, y): if poptorch.isRunningOnIpu(): output = x + y else: output = x * y return output model = Network() options = poptorch.Options() inference_model = poptorch.inferenceModel(model, options) x = torch.tensor([50]) y = torch.tensor([2]) assert inference_model(x, y) == 52 assert model(x, y) == 100 @pytest.mark.ipuHardwareRequired @pytest.mark.parametrize("enabled", [True, False, None]) @helpers.overridePoptorchLogLevel("INFO") def test_ipu_model(enabled, capfd): class Model(nn.Module): def forward(self, x, y): return x + y model = Model() opts = poptorch.Options() if enabled is not None: opts.useIpuModel(enabled) poptorch_model = poptorch.inferenceModel(model, opts) x = torch.tensor([50]) y = torch.tensor([2]) poptorch_model(x, y) log = helpers.LogChecker(capfd) if enabled is None: log.assert_not_contains("From the user configuration: Ipu model") elif enabled: log.assert_contains("From the user configuration: Ipu model: Enabled") else: log.assert_contains("From the user configuration: Ipu model: Disabled") @pytest.mark.ipuHardwareRequired @helpers.overridePoptorchLogLevel("DEBUG") def test_log_cycle_count(capfd): class LogChecker(helpers.LogChecker): 
def validate(self): self.assert_contains("Total number of IPU cycles: ") class Network(nn.Module): def forward(self, x, y): return x + y opts = poptorch.Options().logCycleCount(True) inference_model = poptorch.inferenceModel(Network(), opts) x = torch.tensor([1]) y = torch.tensor([2]) inference_model(x, y) assert inference_model.cycleCount() > 0 log = LogChecker(capfd) log.validate() def test_profile_report_with_model_name(): def test(dirname): model = torch.nn.Linear(100, 100) opts = poptorch.Options() opts.modelName("tommyflowers") opts.enableProfiling(dirname) poptorch_model = poptorch.inferenceModel(model, opts) x = torch.randn(100, 100) poptorch_model(x) dirname = tempfile.mkdtemp() x = threading.Thread(target=test, args=(dirname, )) x.start() x.join() assert os.path.exists(os.path.join(dirname, "tommyflowers", "profile.pop")) def test_profile_report(): def test(dirname): model = torch.nn.Linear(100, 100) opts = poptorch.Options() opts.enableProfiling(dirname) poptorch_model = poptorch.inferenceModel(model, opts) x = torch.randn(100, 100) poptorch_model(x) dirname = tempfile.mkdtemp() x = threading.Thread(target=test, args=(dirname, )) x.start() x.join() assert os.path.exists(os.path.join(dirname, "inference", "profile.pop")) mean_reduction_strategy_params = [ # accum_type, training, combined_accum, correct_strategy # Post should be the float32 default (torch.float32, True, False, MeanReductionStrategy.Post), # Running should be the float16 default (torch.float16, True, False, MeanReductionStrategy.Running), # Running is not supported for combined_accum, so Post should be used (torch.float16, True, True, MeanReductionStrategy.Post), # The default accum_type is float32 so strategy should be Post when this is None (None, True, False, MeanReductionStrategy.Post), # The option isn't used in inference so it should remain as Post by default (None, False, False, MeanReductionStrategy.Post), ] @pytest.mark.parametrize( "accum_type, training, combined_accum, 
correct_strategy", mean_reduction_strategy_params) def test_mean_reduction_strategy_implicit(accum_type, training, combined_accum, correct_strategy): t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) # A simple adder model just to test the correct strategy is set model = helpers.ModelWithWeights(lambda x, y: x + y, t1.shape) options = poptorch.Options() optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01, accum_type=accum_type, use_combined_accum=combined_accum) poptorch_model = poptorch.trainingModel( model, options, optimizer) if training else poptorch.inferenceModel( model, options) poptorch_model.compile((t1, t2)) assert (getattr( poptorch_model.options.Training, "meanAccumulationAndReplicationReductionStrategy") == correct_strategy) def test_mean_reduction_strategy_explicit(): t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) # A simple adder model just to test the correct strategy is set model = helpers.ModelWithWeights(lambda x, y: x + y, t1.shape) options = poptorch.Options() options.Training.setMeanAccumulationAndReplicationReductionStrategy( MeanReductionStrategy.Running) poptorch_model = poptorch.trainingModel(model, options) poptorch_model.compile((t1, t2)) assert (getattr(options.Training, "meanAccumulationAndReplicationReductionStrategy") == MeanReductionStrategy.Running) def test_num_io_tiles(): options = poptorch.Options() error_msg = "numIOTiles must be an even number between 32 and 192." 
with pytest.raises(AssertionError, match=error_msg): options.TensorLocations.numIOTiles(10) with pytest.raises(AssertionError, match=error_msg): options.TensorLocations.numIOTiles(193) with pytest.raises(AssertionError, match=error_msg): options.TensorLocations.numIOTiles(33) options.TensorLocations.numIOTiles(32) options.TensorLocations.numIOTiles(192) options.TensorLocations.numIOTiles(100) # pylint: disable=protected-access def test_options_change_after_use(): model = helpers.ModelWithWeights(torch.nn.Linear(10, 10), torch.Size((5, 10)), loss_fn=torch.nn.CrossEntropyLoss()) opts = poptorch.Options() poptorch_model = poptorch.trainingModel(model, options=opts) with pytest.raises(Exception): opts.randomSeed(42) with pytest.raises(Exception): poptorch_model.options.set(random_seed=42) with pytest.raises(Exception): opts.Training.gradientAccumulation(0) with pytest.raises(Exception): popart_opts = opts._Popart opts._Popart.set("groupNormStridedChannelGrouping", True) opts = poptorch.Options() features = torch.randn([100, 1, 128, 128]) labels = torch.empty([100], dtype=torch.long).random_(10) dataset = torch.utils.data.TensorDataset(features, labels) poptorch_data_loader = poptorch.DataLoader( opts, dataset=dataset, ) with pytest.raises(Exception): opts.randomSeed(42) with pytest.raises(Exception): poptorch_data_loader.options.set(random_seed=42) with pytest.raises(Exception): poptorch_data_loader.options.Training.gradientAccumulation(0) with pytest.raises(Exception): popart_opts = poptorch_data_loader.options._Popart popart_opts.set("groupNormStridedChannelGrouping", True) def test_copied_options_unfrozen(): opts = poptorch.Options() # Freeze the opts. _ = poptorch.DataLoader( opts, dataset=torch.utils.data.TensorDataset( torch.randn([100, 1, 128, 128]), torch.empty([100], dtype=torch.long).random_(10), ), ) copied_opts = copy.deepcopy(opts) # Make sure that no 'Can't modify frozen Options' errors are raised. 
copied_opts.deviceIterations(5) copied_opts.Distributed.configureProcessId(5, 15) copied_opts._Popart.set("autoRecomputation", 3) copied_opts.Training.gradientAccumulation(4) copied_opts.TensorLocations.setWeightLocation( poptorch.TensorLocationSettings().useIOTilesToStore(True)) copied_opts.Precision.setPartialsType(torch.float16) def test_wrap_options(): """Popdist wraps poptorch Options using something similar""" class _Distributed(poptorch.options._DistributedOptions): pass opts = poptorch.Options() opts.Distributed.__class__ = _Distributed def test_options_printing(capsys): """Check that the Options class displays meaningful information""" opts = poptorch.Options() opts.replicationFactor(4) print(opts) captured = capsys.readouterr() id_string = f"{id(opts):x}" # Default printing is hexadecimal ID of object error_str = "The Options class should be printing meaningful informations" assert id_string not in captured.out, error_str assert captured.out.startswith("Options(") assert "replication_factor=4" in captured.out ================================================ FILE: tests/other_ops_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import json
import re
import torch
import pytest
import helpers
import poptorch

torch.manual_seed(42)

# (equation, operands) pairs used by test_einsum.
params_einsum = [
    ('i->', (torch.randn(5), )),
    ('ij->i', (torch.randn(5, 4), )),
    ('i,j->j', (torch.randn(5), torch.randn(4))),
    ('i,j->ji', (torch.randn(5), torch.randn(4))),
    ('bij,bjk->bik', (torch.randn(3, 2, 5), torch.randn(3, 5, 4))),
    ('bn,anm,bm->ba', (torch.randn(2, 5), torch.randn(3, 5, 4),
                       torch.randn(2, 4))),
    ('bfnd,ndh->bfh', (torch.randn(2, 3, 4, 5), torch.randn(4, 5, 6))),
    ('nmku,buvm->bnkv', (torch.randn(2, 3, 4, 5), torch.randn(6, 5, 7, 3))),
]


def default_assert_fn(native_out, poptorch_out):
    """Assert the PopTorch output is close to the native output.

    When the native output is a tuple the two outputs are compared
    element by element; otherwise they are compared directly.
    """
    if isinstance(native_out, tuple):
        for native, pop in zip(native_out, poptorch_out):
            helpers.assert_allclose(expected=native, actual=pop)
    else:
        helpers.assert_allclose(expected=native_out, actual=poptorch_out)


def op_harness(op, *inputs, assert_fn=None, out_fn=None):
    """Run `op` natively and through a PopTorch training model, then compare.

    Args:
        op: The callable under test, wrapped in `helpers.ModelWithWeights`.
        *inputs: Inputs passed to both models as a single tuple.
        assert_fn: Optional comparison taking `(native_out, poptorch_out)`;
            `default_assert_fn` is used when omitted.
        out_fn: Forwarded to `helpers.ModelWithWeights`.

    Returns:
        The `(model, poptorch_model)` pair so callers can inspect them.
    """
    model = helpers.ModelWithWeights(op, inputs[0].shape, out_fn=out_fn)
    poptorch_model = poptorch.trainingModel(model)

    # Run on CPU
    native_out, _ = model(inputs)

    # Run on IPU
    poptorch_out, _ = poptorch_model(inputs)

    if assert_fn is None:
        assert_fn = default_assert_fn

    # Inference test - check outputs
    assert_fn(native_out, poptorch_out)

    # Training test - check weights changed
    poptorch_model.assert_weights_changed()

    return model, poptorch_model


def op_harness_inference(model, *inputs):
    """Compare `model` against its PopTorch inference wrapper on `inputs`."""
    poptorch_model = poptorch.inferenceModel(model)
    native_out = model(*inputs)
    poptorch_out = poptorch_model(*inputs)
    default_assert_fn(native_out, poptorch_out)


@pytest.mark.parametrize("params", params_einsum)
# A list (not a set literal) keeps the parameter order explicit.
@pytest.mark.parametrize("implicit_rhs", [True, False])
def test_einsum(params, implicit_rhs):
    """Check `torch.einsum` with and without an explicit output spec."""
    # Dropping everything from '->' onwards gives the implicit-output form.
    eq = params[0].split('->')[0] if implicit_rhs else params[0]
    op = lambda *xs: torch.einsum(eq, *xs)
    op_harness(op, *params[1])


def test_einsum_chained():
    """Check two einsum calls where the second consumes the first's result."""
    torch.manual_seed(42)

    def op(x, y, z):
        r = torch.einsum('b u k m, b u v m -> b k v', x, y)
        return torch.einsum('b h k n, b k v -> b h v n', z, r)

    inputs = [torch.randn(1, 4, 16, 4, dtype=torch.float) for _ in range(3)]

    def assert_fn(native_out, poptorch_out):
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_out,
                                rtol=1e-3,
                                atol=1e-3)

    op_harness(op, *inputs, assert_fn=assert_fn)


def test_einsum_transpose():
    """Check an einsum that only permutes dimensions."""
    torch.manual_seed(42)

    def op(x):
        return torch.einsum('n c h w -> n h w c', x)

    inputs = [torch.randn(2, 3, 4, 5, dtype=torch.float)]

    def assert_fn(native_out, poptorch_out):
        helpers.assert_allclose(expected=native_out,
                                actual=poptorch_out,
                                rtol=1e-3,
                                atol=1e-3)

    op_harness(op, *inputs, assert_fn=assert_fn)


@pytest.mark.parametrize("arr_lengths",
                         ([3], [3, 3], [2, 4], [3, 2, 4], [5, 2, 3, 4]))
def test_meshgrid(arr_lengths):
    """Check `torch.meshgrid` for one to four 1-D inputs."""
    torch.manual_seed(42)

    inputs = [torch.randn(arr_length) for arr_length in arr_lengths]

    # out_fn picks the first output of the op for ModelWithWeights.
    op_harness(torch.meshgrid, *inputs, out_fn=lambda x: x[0])


@pytest.mark.parametrize("arr_lengths",
                         ([3], [3, 3], [2, 4], [3, 2, 4], [5, 2, 3, 4]))
def test_cartesian_prod(arr_lengths):
    """Check `torch.cartesian_prod` for one to four 1-D inputs."""
    torch.manual_seed(42)

    inputs = [torch.randn(arr_length) for arr_length in arr_lengths]

    op_harness(torch.cartesian_prod, *inputs)


@pytest.mark.parametrize("dims", (2, ([2], [0]), ([2, 3], [0, 1])))
def test_tensordot(dims):
    """Check `torch.tensordot` with an int and with explicit dim lists."""
    torch.manual_seed(42)

    op = lambda a, b: torch.tensordot(a, b, dims)

    x = torch.randn(2, 3, 5, 4)
    y = torch.randn(5, 4, 1)

    op_harness(op, x, y)


@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("dim", range(-3, 3))
def test_scatter_add(inplace, dim):
    """Check `scatter_add` / `scatter_add_` along every dim of a 3-D input."""

    class Model(torch.nn.Module):
        def __init__(self, dim, dim_size):
            super().__init__()
            self.dim = dim
            self.dim_size = dim_size
            # `inplace` comes from the enclosing test's parameters.
            self.inplace = inplace

        def forward(self, src, index):
            sz = list(src.shape)
            sz[self.dim] = self.dim_size
            out = torch.ones(sz)
            if self.inplace:
                return out.scatter_add_(self.dim, index, src)

            return out.scatter_add(self.dim, index, src)

    torch.manual_seed(42)
    x = torch.randn(4, 8, 16)
    dim_size = x.shape[dim] // 2
    index = torch.randint_like(x, high=dim_size).long()

    op_harness(Model(dim, dim_size), x, index)
@pytest.mark.parametrize("dim", range(-3, 3)) @pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod']) @pytest.mark.parametrize("include_self", [True, False]) def test_scatter_reduce(dim, reduce, include_self): class Model(torch.nn.Module): def __init__(self, dim, reduce, include_self): super().__init__() self.dim = dim self.reduce = reduce self.include_self = include_self def forward(self, inp, index, src): output = inp.scatter_reduce(self.dim, index, src, reduce=self.reduce, include_self=self.include_self) return output torch.manual_seed(42) src = torch.randn(4, 8, 16) dim_size = src.shape[dim] // 2 sz = list(src.shape) sz[dim] = dim_size inp = torch.randn(sz) index = torch.randint_like(src, high=dim_size).long() op_harness(Model(dim, reduce, include_self), inp, index, src) @pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod']) @pytest.mark.parametrize("include_self", [True, False]) def test_scatter_reduce_fusable(reduce, include_self): dim = 0 torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self, dim, reduce, include_self): super().__init__() self.dim = dim self.reduce = reduce self.include_self = include_self def forward(self, inp, index, src): output = [] for i in range(3): output.append( inp.scatter_reduce(self.dim, index, src[i], reduce=self.reduce, include_self=self.include_self)) return torch.cat(output, dim=1) src = [torch.randn(8, 16) for i in range(3)] inp = torch.randn(torch.Size([4, 16])) index = torch.randint_like(src[0], high=4).long() _, poptorch_model = op_harness(Model(dim, reduce, include_self), inp, index, src) all_ops = json.loads(poptorch_model._debugGetPopartIR())['maingraph'] # pylint: disable=protected-access scatter_reduce_ops = [ op for op in all_ops if op['type'] == 'ScatterReduce' ] expected_scatter_len = 1 if include_self else 2 assert len(scatter_reduce_ops) == expected_scatter_len expected_group_size = 6 if reduce == 'mean' else 3 assert int(scatter_reduce_ops[0]['attributes'] 
['group_size']) == expected_group_size @pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean', 'prod']) @pytest.mark.parametrize("include_self", [True, False]) def test_scatter_reduce_should_not_apply_grouped_fuse(reduce, include_self): dim = 0 torch.manual_seed(42) num_scatters = 3 class Model(torch.nn.Module): def __init__(self, dim, reduce, include_self, num_scatters): super().__init__() self.dim = dim self.reduce = reduce self.include_self = include_self self.num_scatters = num_scatters def forward(self, inp, index, src): output = [] for i in range(self.num_scatters): output.append(inp[i].scatter_reduce( self.dim, index, src[i], reduce=self.reduce, include_self=self.include_self)) return torch.cat(output, dim=1) src = [torch.randn(8, 16 + i) for i in range(num_scatters)] inp = [torch.randn(torch.Size([8, 16 + i])) for i in range(num_scatters)] index = torch.randint(low=0, high=8, size=[8, 1]).long() model = Model(dim, reduce, include_self, num_scatters) poptorch_model = poptorch.inferenceModel(model) poptorch_model.compile(inp, index, src) default_assert_fn(model(inp, index, src), poptorch_model(inp, index, src)) all_ops = json.loads(poptorch_model._debugGetPopartIR())['maingraph'] # pylint: disable=protected-access scatter_reduce_ops = [ op for op in all_ops if op['type'] == 'ScatterReduce' ] assert len(scatter_reduce_ops) >= 3 @pytest.mark.parametrize("dim", range(-3, 3)) @pytest.mark.parametrize("reduce", ["mean", "amax", "amin", "prod"]) @pytest.mark.parametrize("include_self", [True, False]) def test_index_reduce(dim, reduce, include_self): class Model(torch.nn.Module): def __init__(self, dim, reduce, include_self): super().__init__() self.dim = dim self.reduce = reduce self.include_self = include_self def forward(self, inp, index, src): output = inp.index_reduce_(self.dim, index, src, reduce=self.reduce, include_self=self.include_self) return output torch.manual_seed(17) inp = torch.randn(5, 8, 11) dim_size = inp.shape[dim] // 2 sz = 
list(inp.shape) sz[dim] = dim_size src = torch.randn(sz) index = torch.randint(high=dim_size, size=(dim_size, )).long() op_harness(Model(dim, reduce, include_self), inp, index, src) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") @pytest.mark.parametrize("reduce", ['sum', 'amin', 'amax', 'mean']) @pytest.mark.parametrize("expand_as", [True, False]) @pytest.mark.parametrize("include_self", [True, False]) def test_2d_scatter_reduce_with_index_expansion(capfd, reduce, expand_as, include_self): class Model(torch.nn.Module): def __init__(self, reduce, include_self): super().__init__() self.reduce = reduce self.include_self = include_self def forward(self, inp, index, src): if expand_as: index = index.expand_as(src) else: index = index.expand(src.shape) output = inp.scatter_reduce(-2, index, src, reduce=self.reduce, include_self=self.include_self) return output model = Model(reduce, include_self) poptorch_model = poptorch.inferenceModel(model) torch.manual_seed(0) index = torch.randint(0, 5, (6, 1), dtype=torch.long) src = torch.rand((6, 3)) inp = torch.randn((5, 3)) out = model(inp, index, src) poptorch_out = poptorch_model(inp, index, src) helpers.assert_allclose(actual=poptorch_out, expected=out) # Make sure the expand op is removed. 
look_for = "aten::expand_as" if expand_as else "aten::expand" log = helpers.LogChecker(capfd) it = log.createIterator() it.findNext("Removing index expansion node:") it.assert_not_contains(look_for) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") @pytest.mark.parametrize("expand_as", [True, False]) def test_2d_scatter_add_with_index_expansion(capfd, expand_as): class Model(torch.nn.Module): def forward(self, index, src): if expand_as: index = index.expand_as(src) else: index = index.expand(src.shape) return torch.zeros((5, 3)).scatter_add_( dim=-2, index=index, src=src, ) model = Model() poptorch_model = poptorch.inferenceModel(model) torch.manual_seed(0) index = torch.randint(0, 5, (6, 1), dtype=torch.long) src = torch.rand((6, 3)) out = model(index, src) poptorch_out = poptorch_model(index, src) helpers.assert_allclose(actual=poptorch_out, expected=out) # Make sure the expand op is removed. look_for = "aten::expand_as" if expand_as else "aten::expand" log = helpers.LogChecker(capfd) it = log.createIterator() it.findNext("Removing index expansion node:") it.assert_not_contains(look_for) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") @pytest.mark.parametrize("expand_as", [True, False]) @pytest.mark.parametrize("params", [ { "shape": (3, 5), "gather_dim": 0, "expand_dim": 0, "should_optimise": False }, { "shape": (3, 5), "gather_dim": 0, "expand_dim": 1, "should_optimise": True }, { "shape": (3, 5), "gather_dim": 1, "expand_dim": 0, "should_optimise": True }, { "shape": (3, 5), "gather_dim": 1, "expand_dim": 1, "should_optimise": False }, { "shape": (1, 1, 3, 1, 5, 1), "gather_dim": 3, "expand_dim": 2, "should_optimise": False }, { "shape": (1, 1, 3, 1, 5, 1), "gather_dim": 2, "expand_dim": 4, "should_optimise": True }, { "shape": (1, 1, 3, 1, 5, 1), "gather_dim": 4, "expand_dim": 2, "should_optimise": True }, { "shape": (1, 1, 3, 1, 5, 1), "gather_dim": 4, "expand_dim": 1, "should_optimise": False }, { "shape": (3, 4, 5), 
"gather_dim": 0, "expand_dim": 1, "should_optimise": False }, ]) def test_gather_with_index_expansion(capfd, expand_as, params): # Work out params to model. torch.manual_seed(42) data = torch.randint(10, params["shape"], dtype=torch.int) indices_shape = list(data.shape) indices_shape[params["expand_dim"]] = 1 indices = torch.randint(high=data.shape[params["gather_dim"]], size=indices_shape) # Make model. class Model(torch.nn.Module): def forward(self, data, indices): if expand_as: indices = indices.expand_as(data) else: indices = indices.expand(data.shape) # Also do an `add`, to check we can pipe the results onward. return torch.gather(data, params["gather_dim"], indices).add(8) model = Model() poptorch_model = poptorch.inferenceModel(model) # Run model, check result is still correct. cpu_out = model(data, indices) ipu_out = poptorch_model(data, indices) helpers.assert_allclose(actual=ipu_out, expected=cpu_out) log = helpers.LogChecker(capfd) it = log.createIterator() # Look for the log saying we did the optimisation, only if we should have. if params["should_optimise"]: it.findNext("Optimising gather:") # Look for the (non-)presence of the expand op that should be removed. 
# Note: aten::expand_as might be intercepted as aten::expand by the dispatcher # so only check for "expand" remove_if_optimised = "aten::expand" if params["should_optimise"]: it.assert_not_contains(remove_if_optimised) else: it.findNext(remove_if_optimised) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_available_memory_scatter_add(capfd): class Model(torch.nn.Module): def __init__(self, dim, dim_size): super().__init__() self.dim = dim self.dim_size = dim_size def forward(self, src, index): sz = list(src.shape) sz[self.dim] = self.dim_size out = torch.ones(sz) sa = out.scatter_add(self.dim, index, src) am = poptorch.set_available_memory(sa, 0.9) return am dim = 2 torch.manual_seed(42) x = torch.randn(4, 8, 16) dim_size = x.shape[dim] // 2 index = torch.randint_like(x, high=dim_size).long() model = Model(dim, dim_size) poptorch_model = poptorch.inferenceModel(model) poptorch_model(x, index) log = helpers.LogChecker(capfd) it = log.createIterator() it.findNext("Graph before lowering to PopART:") # Assert that the set_available_memory node references the scatterreduce, # not the add. 
sa_line = it.findNext("popart::scatterreduce").strip() sa_var = sa_line.partition(" ")[0] sam_line = it.findNext("poptorch::set_available_memory").strip() # Check we have set_available_memory[...](%XX) where XX is the result of scatterreduce assert re.search(r"set_available_memory\[.*\]\(\{}\)".format(sa_var), sam_line) basic_test_data = [ ([[3, 6, 9], [3, 6, 10], [-1, 0, 1], [8, 9, 140]], [1, 3, 5, 7, 9]), ([[2, 5, 10], [6, 8, 3]], [1, 5, 7, 8, 10]), (1, [1, 5, 7, 8, 10]), ([1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6]), ([[[1, 3, 5], [2, 4, 6]], [[1, 2, 3], [4, 5, 6]]], [1, 2, 3, 4, 5, 6]), ([1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 6, 4, 5]), ] def bucketize_op_test_body(right, data, dtypes): input_data, boundaries_data = data input_dtype, boundaries_dtype = dtypes input = torch.tensor(input_data, dtype=input_dtype) boundaries = torch.tensor(boundaries_data, dtype=boundaries_dtype) class Model(torch.nn.Module): def __init__(self, right): super().__init__() self.right = right def forward(self, input, boundaries): return torch.bucketize(input, boundaries, right=self.right) op_harness_inference(Model(right), input, boundaries) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("dtypes", [(torch.float32, torch.float32), (torch.float32, torch.float32), (torch.int32, torch.float32), (torch.float32, torch.int32)]) @pytest.mark.parametrize("data", basic_test_data) def test_bucketize_basic(right, data, dtypes): bucketize_op_test_body(right, data, dtypes) fp_test_data = [ ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9]), ( [[[1, 3, 5], [2, 4, 6]], [[1, 2, 3], [4, 5, 6]]], [0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9], ), ] @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("data", basic_test_data) def test_bucketize_fp(right, data): bucketize_op_test_body(right, data, (torch.float32, torch.int32)) @pytest.mark.parametrize("out_int32", [True, False]) def test_bucketize_inplace(out_int32): input = torch.tensor([[2, 5, 10], 
[6, 8, 3]], dtype=torch.int32) boundaries = torch.tensor([1, 5, 7, 8, 10], dtype=torch.int32) out_dtype = torch.int32 if out_int32 else torch.int64 out_poptorch = torch.zeros(2, 3, dtype=out_dtype) out_native = out_poptorch.clone() class Model(torch.nn.Module): def forward(self, input, boundaries, out): return torch.bucketize(input, boundaries, out_int32=out_int32, out=out) model = Model() returned_out_native = model(input, boundaries, out_native) poptorch_model = poptorch.inferenceModel(model) returned_out_poptorch = poptorch_model(input, boundaries, out_poptorch) default_assert_fn(returned_out_poptorch, out_poptorch) default_assert_fn(out_native, out_poptorch) default_assert_fn(returned_out_native, returned_out_poptorch) ================================================ FILE: tests/outputs_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import torch import torch.nn as nn import helpers import poptorch def test_multiple_tensors(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (t1, x * y) return t2[0], y - x, t2[1] + t1 # Create our model. model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) ipu = inference_model(x, y) ref = model(x, y) helpers.assert_allclose(actual=ipu, expected=ref) def test_simple_list(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (t1, x * y) return [t2[0], y - x, t2[1] + t1] # Create our model. model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) ipu = inference_model(x, y) ref = model(x, y) helpers.assert_allclose(actual=ipu, expected=ref) def test_simple_tuple(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (t1, x * y) return (t2[0], y - x, t2[1] + t1) # Create our model. 
model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) ipu = inference_model(x, y) ref = model(x, y) helpers.assert_allclose(actual=ipu, expected=ref) def test_nested_tuples(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (t1, x * y) return x, (t2, y - x, t2[1] + t1), (y, ((t1 * 2.0))) # Create our model. model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) ipu = inference_model(x, y) ref = model(x, y) helpers.assert_allclose(actual=ipu, expected=ref) def test_same_tensor(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (t1, x * y) return t1, (t1, t2, t1) # Create our model. model = Network() inference_model = poptorch.inferenceModel(model) x = torch.ones(2) y = torch.zeros(2) ipu = inference_model(x, y) ref = model(x, y) helpers.assert_allclose(actual=ipu, expected=ref) def test_dict(): class Network(nn.Module): def forward(self, x, y): t1 = (x + y) t2 = (x * y) # Note: keys are not in alphabetical order return {'b': t1, 'a': t2} # Create our model. cpu_model = Network() ipu_model = poptorch.inferenceModel(cpu_model) x = torch.ones(2) y = torch.zeros(2) cpu_res = cpu_model(x, y) ipu_res = ipu_model(x, y) # Check the outputs are the same assert cpu_res.keys() == ipu_res.keys() for k in cpu_res.keys(): assert torch.allclose(cpu_res[k], ipu_res[k]) ================================================ FILE: tests/overlapped_io_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. 
import torch
import pytest
import poptorch

# Size of the last dimension of every input and of both square dimensions of
# every weight created by get_model().
INPUT_SIZE = 64


def get_model(num_mat_muls,
              input_a_overlap=poptorch.OverlapMode.NoOverlap,
              input_b_overlap=poptorch.OverlapMode.NoOverlap,
              loss_overlap=poptorch.OverlapMode.NoOverlap,
              sum_all_overlap=poptorch.OverlapMode.NoOverlap):
    """Build the matmul model shared by the overlapped IO tests.

    The model owns ``num_mat_muls`` pairs of parameters ("a<idx>" and
    "b<idx>"), each of shape ``[1, INPUT_SIZE, INPUT_SIZE]``. Its forward
    multiplies every "a" parameter with ``input_a`` and every "b" parameter
    with ``input_b``, sums all the products and feeds the sum to a
    cross entropy loss.

    Args:
        num_mat_muls: Number of "a"/"b" parameter pairs (and therefore of
            matmul pairs) to create.
        input_a_overlap: Overlap mode applied to ``input_a``.
        input_b_overlap: Overlap mode applied to ``input_b``.
        loss_overlap: Overlap mode applied to the returned loss.
        sum_all_overlap: Overlap mode applied to the returned sum.

    Returns:
        A new instance of the model.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()

            for idx in range(num_mat_muls):
                self.register_parameter(
                    "a" + str(idx),
                    torch.nn.Parameter(
                        torch.randn([1, INPUT_SIZE, INPUT_SIZE],
                                    dtype=torch.float32)))

                self.register_parameter(
                    "b" + str(idx),
                    torch.nn.Parameter(
                        torch.randn([1, INPUT_SIZE, INPUT_SIZE],
                                    dtype=torch.float32)))

            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, input_a, input_b, labels):
            with poptorch.Block(ipu_id=0):
                # Set overlap settings
                input_a = poptorch.set_overlap_for_input(
                    input_a, input_a_overlap)
                input_b = poptorch.set_overlap_for_input(
                    input_b, input_b_overlap)

                # remove leading 1 dim
                input_a = input_a.squeeze()
                input_b = input_b.squeeze()

                # Interleave the "a" and "b" products so that both inputs
                # contribute to the final sum.
                to_sum = []
                for idx in range(num_mat_muls):
                    to_sum.append(
                        torch.matmul(self.get_parameter("a" + str(idx)),
                                     input_a))
                    to_sum.append(
                        torch.matmul(self.get_parameter("b" + str(idx)),
                                     input_b))

                sum_all = torch.sum(torch.stack(to_sum, dim=0), dim=0)
                loss = self.loss(sum_all.unsqueeze(dim=0), labels)

                # Both returned tensors get their own overlap mode.
                loss = poptorch.set_overlap_for_output(loss, loss_overlap)
                sum_all = poptorch.set_overlap_for_output(
                    sum_all, sum_all_overlap)

                return loss, sum_all

    return Model()


@pytest.mark.ipuHardwareRequired
def test_io_input():
    """Train once with both inputs marked OverlapAccumulationLoop."""
    num_mat_muls = 20

    model = get_model(num_mat_muls,
                      poptorch.OverlapMode.OverlapAccumulationLoop,
                      poptorch.OverlapMode.OverlapAccumulationLoop)

    num_grad_accumulations = 10
    num_device_iterations = 20

    opts = poptorch.Options()
    opts.outputMode(poptorch.OutputMode.All)
    opts.deviceIterations(num_device_iterations)
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)
    opts.Training.gradientAccumulation(num_grad_accumulations)

    poptorch_model = poptorch.trainingModel(model, options=opts)

    # One row of input per gradient accumulation step and device iteration.
    total_batch_size = num_grad_accumulations * num_device_iterations
    input_a = torch.randn((total_batch_size, INPUT_SIZE))
    input_b = torch.randn((total_batch_size, INPUT_SIZE))
    # randint's upper bound is exclusive: every label is 0.
    labels = torch.randint(0, 1, (total_batch_size, INPUT_SIZE))

    poptorch_model(input_a, input_b, labels)


@pytest.mark.ipuHardwareRequired
def test_input_error_messages():
    """Check the errors raised for invalid uses of set_overlap_for_input."""

    # Case 1: the input is consumed by "x + 1" as well as by
    # set_overlap_for_input.
    class DoubleInputUseModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            x2 = poptorch.set_overlap_for_input(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            return y, x2

    model = DoubleInputUseModel()
    poptorch_model = poptorch.inferenceModel(model)

    # The message embeds a numeric identifier for the offending input.
    label = r"[0-9]+"

    err_msg = (r"poptorch.set_overlap_for_input must be the only op applied "
               r"to an input. This is not the case for input " + label +
               r" to the model.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))

    # Case 2: set_overlap_for_input is applied to an intermediate tensor.
    class NotOnInputModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_input(
                y, poptorch.OverlapMode.OverlapAccumulationLoop)
            return y, y2

    model = NotOnInputModel()
    poptorch_model = poptorch.inferenceModel(model)

    err_msg = (r"poptorch.set_overlap_for_input applied on a node which is "
               r"not a tensor input to the model.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))

    # Case 3: a valid model, compiled without any explicit options.
    class NormalModel(torch.nn.Module):
        def forward(self, x):
            x2 = poptorch.set_overlap_for_input(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            y = x2 + 1
            return y

    model = NormalModel()
    poptorch_model = poptorch.inferenceModel(model)

    err_msg = (r"Overlapped IO is not supported with poptorch.Pipelined"
               r"Execution. If you are using only one IPU, please switch to "
               r"poptorch.ShardedExecution.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))

    # Case 4: same model with sharded execution but no IO tiles.
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    poptorch_model = poptorch.inferenceModel(model, options=opts)

    err_msg = (r"No IO tiles allocated. You must allocate at least 32 IO tiles"
               r" using poptorch.Options\(\).TensorLocations.numIOTiles.")
    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))

    # Case 5: with 32 IO tiles the same model is expected to run.
    opts = opts.clone()
    opts.TensorLocations.numIOTiles(32)

    poptorch_model = poptorch.inferenceModel(model, options=opts)
    poptorch_model(torch.tensor([1.0]))


@pytest.mark.ipuHardwareRequired
def test_overlap_host_io_output():
    """Train once with both outputs marked OverlapAccumulationLoop."""
    num_mat_muls = 20

    model = get_model(num_mat_muls, poptorch.OverlapMode.NoOverlap,
                      poptorch.OverlapMode.NoOverlap,
                      poptorch.OverlapMode.OverlapAccumulationLoop,
                      poptorch.OverlapMode.OverlapAccumulationLoop)

    num_grad_accumulations = 10
    num_device_iterations = 20

    opts = poptorch.Options()
    opts.outputMode(poptorch.OutputMode.All)
    opts.deviceIterations(num_device_iterations)
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)
    opts.Training.gradientAccumulation(num_grad_accumulations)

    poptorch_model = poptorch.trainingModel(model, options=opts)

    # One row of input per gradient accumulation step and device iteration.
    total_batch_size = num_grad_accumulations * num_device_iterations
    input_a = torch.randn((total_batch_size, INPUT_SIZE))
    input_b = torch.randn((total_batch_size, INPUT_SIZE))
    # randint's upper bound is exclusive: every label is 0.
    labels = torch.randint(0, 1, (total_batch_size, INPUT_SIZE))

    poptorch_model(input_a, input_b, labels)


@pytest.mark.ipuHardwareRequired
def test_output_error_messages():
    """Check valid and invalid uses of set_overlap_for_output."""

    # Case 1: both the marked tensor and the tensor it was created from are
    # returned.
    class DoubleOutputUseModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_output(
                y, poptorch.OverlapMode.OverlapAccumulationLoop)
            return y, y2

    model = DoubleOutputUseModel()
    poptorch_model = poptorch.inferenceModel(model)

    err_msg = (
        r"poptorch.set_overlap_for_output cannot be used with a tensor that "
        r"is returned twice. Please check all returned tensors including "
        r"those nested in tuples/lists.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))

    # The remaining cases all use sharded execution with 32 IO tiles.
    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)

    # Case 2: the tensor is reused before being marked; no error is expected.
    class MarkedOutputReuseBeforeModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            z = y + 1
            y2 = poptorch.set_overlap_for_output(
                y, poptorch.OverlapMode.OverlapAccumulationLoop)
            return y2, z

    model = MarkedOutputReuseBeforeModel()
    poptorch_model = poptorch.inferenceModel(model, options=opts)
    poptorch_model(torch.tensor([1.0]))

    # Case 3: the marked tensor is reused afterwards; no error is expected.
    class MarkedOutputReuseAfterModel(torch.nn.Module):
        def forward(self, x):
            y = x + 1
            y2 = poptorch.set_overlap_for_output(
                y, poptorch.OverlapMode.OverlapAccumulationLoop)
            z = y2 + 1
            return y2, z

    model = MarkedOutputReuseAfterModel()
    poptorch_model = poptorch.inferenceModel(model, options=opts)
    poptorch_model(torch.tensor([1.0]))

    # Case 4: the marked tensor is never returned by the model.
    class NonOutputMarked(torch.nn.Module):
        def forward(self, x):
            x = poptorch.set_overlap_for_output(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            y = x + 1
            return y

    model = NonOutputMarked()
    poptorch_model = poptorch.inferenceModel(model, options=opts)

    err_msg = (r"poptorch.set_overlap_for_output applied on a node which is "
               r"not a tensor output to the model.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))


def test_overlap_both_non_input_marked():
    """A second set_overlap_for_input on an intermediate tensor must fail."""

    class NotOnInputModel(torch.nn.Module):
        def forward(self, x):
            # Valid: applied directly to the model input.
            x = poptorch.set_overlap_for_input(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            y = x + 1
            # Invalid: y is computed, it is not a model input.
            y2 = poptorch.set_overlap_for_input(
                y, poptorch.OverlapMode.OverlapAccumulationLoop)
            return y, y2

    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)

    model = NotOnInputModel()
    poptorch_model = poptorch.inferenceModel(model, opts)

    err_msg = (r"poptorch.set_overlap_for_input applied on a node which is "
               r"not a tensor input to the model.")

    with pytest.raises(poptorch.poptorch_core.Error, match=err_msg):
        poptorch_model(torch.tensor([1.0]))


def test_overlap_both_non_output_marked():
    """set_overlap_for_output on a tensor that is not returned must fail."""

    class OutputBeforeLoss(torch.nn.Module):
        def forward(self, x):
            x = poptorch.set_overlap_for_input(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            x = x + torch.ones_like(x)
            # The marked tensor is reduced before being returned, so it is
            # not itself a model output.
            x = poptorch.set_overlap_for_output(
                x, poptorch.OverlapMode.OverlapAccumulationLoop)
            return torch.mean(x)

    model = OutputBeforeLoss()

    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)

    inference_model = poptorch.inferenceModel(model, opts)

    err_msg = (r"poptorch.set_overlap_for_output applied on a node which is "
               r"not a tensor output to the model.")

    with pytest.raises(poptorch.Error, match=err_msg):
        inference_model(torch.tensor([1.0]))


@pytest.mark.ipuHardwareRequired
def test_overlap_tuple():
    """Apply the overlap markers to tuples of tensors rather than tensors."""

    class Model(torch.nn.Module):
        def forward(self, xs):
            xs = poptorch.set_overlap_for_input(
                xs, poptorch.OverlapMode.OverlapDeviceIterationLoop)
            x = torch.cat(xs) + 1
            xs = x.chunk(2)
            return poptorch.set_overlap_for_output(
                xs, poptorch.OverlapMode.OverlapAccumulationLoop)

    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution())
    opts.TensorLocations.numIOTiles(32)

    model = poptorch.inferenceModel(Model(), opts)

    # chunk(2) returns a tuple of two (2, 2) tensors.
    xs = torch.arange(8).reshape(4, 2).chunk(2)
    model(xs)


================================================
FILE: tests/phased_execution_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import torch
import torch.nn.functional as F
import pytest
import helpers
import poptorch

# Model: 2x2 S1 ExecutionPhase, repeated N times:
# _____________________________________________________________________________
# phase 0:            IPU 0            |                       IPU 2
# in0 ---- Slice/Slice -----------------------------.
#            |                         |            |
# w0 ----- MatMul                      |          MatMul ----- w1
#            |                         |            |
#          ReLU                        |          ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#______________________________________X__(inter-phase cross-IPU copy)_________
# phase 1:            IPU 1           /|\                      IPU 3
#            .-----------------------' | '----------.
#            |                         |            |
# w2 ----- MatMul                      |          MatMul ----- w3
#            |                         |            |
#          ReLU                        |          ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#                                      X (intra-phase cross-IPU copy)
#                                     /|\
#            .-----------------------' | '----------.
#            |                         |            |
# w4 ----- MatMul                      |          MatMul ----- w5
#            |                         |            |
#          ReLU                        |          ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#______________________________________X_______________________________________
# phase 2:            IPU 0           /|\                      IPU 2
# ......                               |
# ......                               |
#______________________________________X__(inter-phase cross-IPU copy)_________
# phase N*2-1:        IPU 1           /|\                      IPU 3
#            .-----------------------' | '----------.
#            |                         |            |
# w2 ----- MatMul                      |          MatMul ----- w3
#            |                         |            |
#          ReLU                        |          ReLU
#            |                         |            |
#            +------------------------.|.-----------+
#                                      X (intra-phase cross-IPU copy)
#                                     /|\
#            .-----------------------' | '----------.
#            |                         |            |
# w4 ----- MatMul                      |          MatMul ----- w5
#            |                         |            |
#          ReLU                        |          ReLU
#            |                         |            |
#            +------------------------------------ Sum ----- L1Loss
#______________________________________|_______________________________________


class LogChecker(helpers.LogChecker):
    """Log checker with the expectations of the phased execution tests."""

    @staticmethod
    def _phased_op(name, index, shape, ipu, phase):
        """Return the log fragment expected for one op of a phased model.

        Args:
            name: Op name as printed in the log (for example "MatMul").
            index: Occurrence index of the op. The first occurrence is
                printed as "<name>:0", the following ones as
                "<name>:0/<index>".
            shape: Text printed between the parentheses of ``float32(...)``;
                an empty string for scalars.
            ipu: Expected IPU id.
            phase: Expected execution phase.
        """
        suffix = ":0" if index == 0 else f":0/{index}"
        return (f"{name}{suffix} [float32({shape}), mode(Phased), "
                f"ipu({ipu}), phase({phase})]")

    def _validate_2x2_parallel(self, repeats):
        """Check the log of the 2x2 parallel phased model.

        The model is made of ``repeats`` repetitions of two phases: the
        first one runs one MatMul/Relu pair per IPU on IPUs 0 and 2, the
        second one runs two MatMul/Relu pairs per IPU on IPUs 1 and 3. That
        is 6 MatMul/Relu pairs per repetition.

        Args:
            repeats: Number of times the two phases are repeated.
        """
        num_phases = 2 * repeats

        self.assert_contains("enablePipelining set to value 0")
        self.assert_contains("executionPhaseSettings.stages set to value 2")
        self.assert_contains(
            f"executionPhaseSettings.phases set to value {num_phases}")
        for kind in ("activation", "weight", "optimizer", "accumulator"):
            self.assert_contains(
                f"location_{kind} set to value useOnChipStorage(False)")

        shape = "10, 1"
        # The input is split in two on IPU 0 during the first phase.
        self.assert_contains(self._phased_op("Slice", 0, shape, 0, 0))
        self.assert_contains(self._phased_op("Slice", 1, shape, 0, 0))

        for index in range(6 * repeats):
            # Within a repetition, pairs 0-1 belong to the even phase and
            # pairs 2-5 to the odd phase.
            phase = 2 * (index // 6) + (1 if index % 6 >= 2 else 0)
            # Even phases use IPUs 0 and 2, odd phases IPUs 1 and 3;
            # consecutive pairs alternate between the two IPUs of the phase.
            ipu = phase % 2 + 2 * (index % 2)
            self.assert_contains(
                self._phased_op("MatMul", index, shape, ipu, phase))
            self.assert_contains(
                self._phased_op("Relu", index, shape, ipu, phase))

        # The final sum and the loss live on IPU 3 in the last phase.
        last_phase = num_phases - 1
        self.assert_contains(self._phased_op("Add", 0, shape, 3, last_phase))
        self.assert_contains(self._phased_op("Sub", 0, shape, 3, last_phase))
        self.assert_contains(self._phased_op("L1", 0, "", 3, last_phase))
        self.assert_contains(
            self._phased_op("IdentityLoss", 0, "", 3, last_phase))

    def validate_2x2_parallel_phased_execution(self):
        """Check the log of the 2x2 model repeated 3 times (6 phases)."""
        self._validate_2x2_parallel(repeats=3)

    def validate_2x2_parallel_phased_execution_small(self):
        """Check the log of the 2x2 model with a single repeat (2 phases)."""
        self._validate_2x2_parallel(repeats=1)

    def validate_serial_tensor_liveness(self, liveness):
        """Check the log of the 3 layer serial phased model.

        Args:
            liveness: The ``poptorch.Liveness`` value the model was
                compiled with.

        Raises:
            ValueError: If no expectation is defined for ``liveness``.
        """
        # 'phases' does not include the bwd pass, so to calculate,
        # sum the number of phases in the fwd pass, plus any phase
        # gap between the end of the fwd and start of the bwd pass
        if liveness == poptorch.Liveness.AlwaysLive:
            # fwd:       bwd:
            # phase 0 -> phase 4
            # phase 1 -> phase 3
            # phase 2 -> phase 2
            phases = 3
            stride = 1
        elif liveness == poptorch.Liveness.OffChipAfterFwd:
            # fwd:       bwd:
            # phase 0 -> phase 8
            # phase 1 -> phase 7
            # phase 2 -> phase 6
            phases = 6
            stride = 1
        elif liveness == poptorch.Liveness.OffChipAfterFwdNoOverlap:
            # fwd:       bwd:
            # phase 0 -> phase 12
            # phase 2 -> phase 10
            # phase 4 -> phase 8
            phases = 8
            stride = 2
        elif liveness == poptorch.Liveness.OffChipAfterEachPhase:
            # fwd:       bwd:
            # phase 0 -> phase 20
            # phase 4 -> phase 16
            # phase 8 -> phase 12
            phases = 12
            stride = 4
        else:
            # Fail with a clear message rather than an UnboundLocalError
            # when a liveness mode has no expectation defined here.
            raise ValueError(
                f"No expected phases defined for liveness {liveness!r}")

        self.assert_contains('set serial_phases_execution to value true')
        self.assert_contains('executionPhaseSettings.stages set to value 1')
        self.assert_contains(
            'executionPhaseSettings.phases set to value {}'.format(phases))

        op_label = ':0'
        for phase in range(3):
            # The expected shapes shrink by one per layer and the layers are
            # 'stride' phases apart.
            self.assert_contains(
                'Transpose{} [float32({}, {}), mode(Phased), ipu(0), phase({})]'
                .format(op_label, 8 - phase, 7 - phase, phase * stride))
            # Either op name is accepted, and the shape may be reported as
            # undefined.
            self.assert_matches(
                (r'(MatMul|Gemm){} \[(float32\({}{}\)|undefined\(shape '
                 r'inference failed\)), mode\(Phased\), ipu\(0\), phase\({}\)]'
                 ).format(op_label, "1, ", 7 - phase, phase * stride))
@helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_2x2_parallel_phased_execution_inline(capfd): N = 3 size = 10 class Model(torch.nn.Module): def __init__(self): super().__init__() self.weights = torch.nn.ParameterList([ torch.nn.Parameter(torch.rand(size, size), requires_grad=True) for n in range(N * 6) ]) def forward(self, in0, target=None): phase = 0 with poptorch.Block("0", ipu_id=0): ins = torch.split(in0, size) weight = iter(self.weights) for n in range(N * 3): out = [] for ipu in range(2): x = ins[ipu] # Alternate between 0-2 and 1-3 ipu = (phase % 2) + ipu * 2 with poptorch.Block(f"{phase}", ipu_id=ipu): x = torch.matmul(next(weight), x) out.append(F.relu(x)) ins = out[1], out[0] # We want 2 matmuls in the same phase if n % 3 != 1: phase += 1 with poptorch.Block(f"{N*2-1}", ipu_id=3): res = ins[0] + ins[1] if target is None: return res return res, torch.nn.L1Loss(reduction="mean")(res, target) input = torch.rand(size * 2, 1) target = torch.rand(size, 1) model = Model() phases = [] phases = [f"{n}" for n in range(2 * N)] opts = poptorch.Options() opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases)) poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.compile(input, target) testlog = LogChecker(capfd) testlog.validate_2x2_parallel_phased_execution() @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_2x2_parallel_phased_execution_opts(capfd): N = 3 size = 10 class Model(torch.nn.Module): def __init__(self): super().__init__() self.weights = torch.nn.ParameterList([ torch.nn.Parameter(torch.rand(size, size), requires_grad=True) for n in range(N * 6) ]) def forward(self, in0, target=None): phase = 0 weight = iter(self.weights) with poptorch.Block("phase0_ipu0"): ins = torch.split(in0, size) for n in range(N * 3): out = [] for ipu in range(2): x = ins[ipu] with poptorch.Block(f"phase{phase}_ipu{ipu}"): x = torch.matmul(next(weight), x) out.append(F.relu(x)) ins = out[1], out[0] # 
We want 2 matmuls in the same phase if n % 3 != 1: phase += 1 with poptorch.Block(f"phase{N*2-1}_ipu1"): res = ins[0] + ins[1] if target is None: return res return res, torch.nn.L1Loss(reduction="mean")(res, target) input = torch.rand(size * 2, 1) target = torch.rand(size, 1) model = Model() phases = [] # Alternate between 0-2 and 1-3 for n in range(N): phases.append([ poptorch.Stage(f"phase{2*n}_ipu0").ipu(0), poptorch.Stage(f"phase{2*n}_ipu1").ipu(2) ]) phases.append([ poptorch.Stage(f"phase{2*n+1}_ipu0").ipu(1), poptorch.Stage(f"phase{2*n+1}_ipu1").ipu(3) ]) opts = poptorch.Options() opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases)) poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.compile(input, target) testlog = LogChecker(capfd) testlog.validate_2x2_parallel_phased_execution() @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_2x2_parallel_phased_execution_small_opts(capfd): size = 10 class Model(torch.nn.Module): def __init__(self): super().__init__() self.weights = torch.nn.ParameterList([ torch.nn.Parameter(torch.rand(size, size), requires_grad=True) for n in range(6) ]) def forward(self, in0, target=None): poptorch.Block.useAutoId() weight = iter(self.weights) # Phase 0 / ipu 0 with poptorch.Block(): in0, in1 = torch.split(in0, size) x = torch.matmul(next(weight), in0) out0 = F.relu(x) # Phase 0 / ipu 2 with poptorch.Block(): x = torch.matmul(next(weight), in1) out1 = F.relu(x) in0, in1 = out1, out0 # Phase 1 / ipu 1 with poptorch.Block(): x = torch.matmul(next(weight), in0) out0 = F.relu(x) # Phase 1 / ipu 3 with poptorch.Block(): x = torch.matmul(next(weight), in1) out1 = F.relu(x) in0, in1 = out1, out0 # Phase 1 / ipu 1 - part 2 with poptorch.Block(): x = torch.matmul(next(weight), in0) out0 = F.relu(x) # Phase 1 / ipu 3 - part 2 with poptorch.Block(): x = torch.matmul(next(weight), in1) out1 = F.relu(x) res = out0 + out1 if target is None: return res return res, 
torch.nn.L1Loss(reduction="mean")(res, target) input = torch.rand(size * 2, 1) target = torch.rand(size, 1) model = Model() strategy = poptorch.ParallelPhasedExecution( [poptorch.Stage("0"), poptorch.Stage("1")], [poptorch.Stage("2", "4"), poptorch.Stage("3", "5")]) # Alternate between 0-2 and 1-3 strategy.phase(0).ipus(0, 2) strategy.phase(1).ipus(1, 3) opts = poptorch.Options() opts.setExecutionStrategy(strategy) poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.compile(input, target) testlog = LogChecker(capfd) testlog.validate_2x2_parallel_phased_execution_small() @pytest.mark.parametrize("liveness", list(poptorch.Liveness)) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_serial_tensor_liveness(capfd, liveness): class Model(torch.nn.Module): def __init__(self): super().__init__() self.fc1 = torch.nn.Linear(8, 7) self.fc2 = torch.nn.Linear(7, 6) self.fc3 = torch.nn.Linear(6, 5) def forward(self, x): with poptorch.Block("B1"): x = self.fc1(x) with poptorch.Block("B2"): x = self.fc2(x) with poptorch.Block("B3"): x = self.fc3(x) return x strategy = poptorch.SerialPhasedExecution("B1", "B2", "B3") strategy.stage("B1").ipu(0) strategy.stage("B2").ipu(0) strategy.stage("B3").ipu(0) strategy.setTensorsLiveness(liveness) opts = poptorch.Options() opts.setExecutionStrategy(strategy) model = Model() model = poptorch.inferenceModel(model, opts) input = torch.randn(8) model.compile(input) testlog = LogChecker(capfd) testlog.validate_serial_tensor_liveness(liveness) def test_phased_api(): # Try to pass a list of Phases poptorch.SerialPhasedExecution( poptorch.Phase('layer1'), poptorch.Phase('layer2'), ) # Try to pass a list of stages poptorch.SerialPhasedExecution( poptorch.Stage('layer1'), poptorch.Stage('layer2'), ) # Try to pass a list of list of stages poptorch.SerialPhasedExecution( [poptorch.Stage('layer1'), poptorch.Stage('layer1.b')], [poptorch.Stage('layer2'), poptorch.Stage('layer2.b')]) # Try to pass a list of list 
of block IDs poptorch.SerialPhasedExecution(["layer1"], ["layer2"]) ================================================ FILE: tests/pipelining_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. import copy import io import json import re import subprocess import tempfile import torch import pytest import helpers import poptorch @helpers.overridePoptorchLogLevel("DEBUG") def test_missing_block(): class Model(torch.nn.Module): def forward(self, x): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = x * 4 x = x * 4 return x m = Model() opts = poptorch.Options() opts.deviceIterations(2) opts.setExecutionStrategy( poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)) m = poptorch.inferenceModel(m, opts) with pytest.raises(poptorch.Error, match="No active Block"): m.compile(torch.randn(2, 5)) assert not poptorch.poptorch_core.isCompilingWithDispatcher(), ( "[Internal] Clean up failed: dispatcher still active") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") @pytest.mark.parametrize("use_scope", [True, False]) def test_api_inline(capfd, use_scope): if use_scope: class Model(torch.nn.Module): def forward(self, x): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = x * 4 with poptorch.Block(ipu_id=1): x = x * 2 return x else: class Model(torch.nn.Module): def forward(self, x): poptorch.Block.useAutoId() poptorch.Block.start(ipu_id=0) x = x * 4 poptorch.Block.start(ipu_id=1) x = x * 2 return x m = Model() opts = poptorch.Options() opts.deviceIterations(2) m = poptorch.inferenceModel(m, opts) m(torch.randn(2, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(1)") @helpers.overridePoptorchLogLevel("DEBUG") def run_recomputation_checkpoint_test(size, model_cls, 
exp_num_stash_ckpted): # pylint: disable=protected-access dev_its = 2 grad_accum = 3 opts = poptorch.Options() opts.deviceIterations(dev_its) opts.Training.gradientAccumulation(grad_accum) opts._Popart.set("autoRecomputation", 3) # All forward pipeline stages. m = poptorch.trainingModel(model_cls(False), opts) m.compile(torch.randn(dev_its * grad_accum, size, 1), torch.randn(dev_its * grad_accum, size, 1)) ir = json.loads(m._debugGetPopartIR()) assert not any("Checkpoint" in node["name"] for node in ir["maingraph"]), ( "Popart IR shouldn't contain any checkpoint") assert sum(["Stash" in node["type"] for node in ir["maingraph"] ]) == 1, ("Only the graph input should be stashed") native_ckpted = model_cls(True) m = poptorch.trainingModel(native_ckpted, opts) m.compile(torch.randn(dev_its * grad_accum, size, 1), torch.randn(dev_its * grad_accum, size, 1)) ir = json.loads(m._debugGetPopartIR()) # pylint: disable=protected-access assert any( "Checkpoint" in node["name"] for node in ir["maingraph"]), ("Popart IR should contain a checkpoint") assert sum([ "Stash" in node["type"] for node in ir["maingraph"] ]) == exp_num_stash_ckpted, ("Both the graph input and the checkpoint(s) " "should be stashed") def test_recomputation_checkpoint_tensor(): pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a " "checkpoint") size = 3 class Model(torch.nn.Module): def __init__(self, checkpoint=False): super().__init__() self.checkpoint = checkpoint weight = torch.nn.Parameter(torch.rand(size, size), requires_grad=True) self.register_parameter("weight", weight) def forward(self, x, target): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = torch.matmul(self.weight, x) if self.checkpoint: x = poptorch.recomputationCheckpoint(x) x = torch.matmul(self.weight, x) with poptorch.Block(ipu_id=1): x = x * 2 return x, torch.nn.functional.l1_loss(x, target) run_recomputation_checkpoint_test(size, Model, 2) def test_recomputation_checkpoint_tensor_two_inputs(): 
pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a " "checkpoint") size = 3 class Model(torch.nn.Module): def __init__(self, checkpoint=False): super().__init__() self.checkpoint = checkpoint weight_1 = torch.nn.Parameter(torch.rand(size, size), requires_grad=True) self.register_parameter("weight_1", weight_1) weight_2 = torch.nn.Parameter(torch.rand(size, size), requires_grad=True) self.register_parameter("weight_2", weight_2) def forward(self, x, target): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = torch.matmul(self.weight_1, x) y = torch.matmul(self.weight_2, x) if self.checkpoint: x, y = poptorch.recomputationCheckpoint(x, y) x = torch.matmul(self.weight_1, x + y) with poptorch.Block(ipu_id=1): x = x * 2 return x, torch.nn.functional.l1_loss(x, target) run_recomputation_checkpoint_test(size, Model, 3) def test_recomputation_checkpoint_tensor_tuple_inputs(): pytest.skip("TODO(T65559): AssertionError: Popart IR should contain a " "checkpoint") size = 3 class Model(torch.nn.Module): def __init__(self, checkpoint=False): super().__init__() self.checkpoint = checkpoint weight_1 = torch.nn.Parameter(torch.rand(size, size), requires_grad=True) self.register_parameter("weight_1", weight_1) weight_2 = torch.nn.Parameter(torch.rand(size, size), requires_grad=True) self.register_parameter("weight_2", weight_2) def forward(self, x, target): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = torch.matmul(self.weight_1, x) y = torch.matmul(self.weight_2, x) if self.checkpoint: x, y = poptorch.recomputationCheckpoint((x, y)) x = torch.matmul(self.weight_1, x + y) with poptorch.Block(ipu_id=1): x = x * 2 return x, torch.nn.functional.l1_loss(x, target) run_recomputation_checkpoint_test(size, Model, 3) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_api_wrap(capfd): """ stage "0" ipu(0) stage(0) l0 l1 l2 """ class Block(torch.nn.Module): def forward(self, x): return x * 6 class Model(torch.nn.Module): 
def __init__(self): super().__init__() self.l1 = Block() self.l2 = Block() def forward(self, x): x = self.l1(x) x = self.l2(x) return x m = Model() poptorch.BeginBlock(m.l1, ipu_id=0) poptorch.BeginBlock(m.l2, ipu_id=0) opts = poptorch.Options() opts.deviceIterations(2) m = poptorch.inferenceModel(m, opts) m(torch.randn(2, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 0") log.assert_contains(" l1/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" l2/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_api_wrap_2stages(capfd): """ stage "0" ipu(0) stage(0) l0 stage "1" ipu(1) stage(1) l1 / l2 """ class Block(torch.nn.Module): def forward(self, x): return x * 6 class Model(torch.nn.Module): def __init__(self): super().__init__() self.l0 = Block() self.l1 = Block() self.l2 = Block() def forward(self, x): x = self.l0(x) x = self.l1(x) x = self.l2(x) return x m = Model() poptorch.BeginBlock(m.l1, ipu_id=1) poptorch.BeginBlock(m.l2, ipu_id=1) opts = poptorch.Options() opts.deviceIterations(2) m = poptorch.inferenceModel(m, opts) m(torch.randn(2, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" l0/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" l1/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)") log.assert_contains(" l2/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)") def test_begin_block_printing(): class Block(torch.nn.Module): def forward(self, x): return x * 6 class Model(torch.nn.Module): def __init__(self): super().__init__() self.l1 = Block() self.l2 = Block() def forward(self, x): x = self.l1(x) x = self.l2(x) return x m = Model() begin_l1 = re.compile(r'\(l1\):\s*BeginBlock\(user_id=None, ipu_id=1\)') begin_l2 = re.compile(r'\(l2\):\s*BeginBlock\(user_id=None, ipu_id=1\)') module_repr = poptorch.module_repr(m) assert not "BeginBlock(" in module_repr 
assert not begin_l1.search(module_repr) assert not begin_l2.search(module_repr) poptorch.BeginBlock(m.l1, ipu_id=1) module_repr = poptorch.module_repr(m) assert begin_l1.search(module_repr) assert not begin_l2.search(module_repr) poptorch.BeginBlock(m.l2, ipu_id=1) module_repr = poptorch.module_repr(m) assert begin_l1.search(module_repr) assert begin_l2.search(module_repr) opts = poptorch.Options() opts.deviceIterations(2) module_repr = repr(poptorch.inferenceModel(m, opts)) assert begin_l1.search(module_repr) assert begin_l2.search(module_repr) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_inline_AutoIncrement(capfd): class Model(torch.nn.Module): def forward(self, x): poptorch.Block.useAutoId() with poptorch.Block(ipu_id=0): x = x * 2 with poptorch.Block(ipu_id=1): x = x * 3 with poptorch.Block(ipu_id=2): x = x * 4 with poptorch.Block(ipu_id=1): x = x * 5 return x m = Model() opts = poptorch.Options() opts.deviceIterations(4).autoRoundNumIPUs(True) opts.setExecutionStrategy( poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)) m = poptorch.inferenceModel(m, opts) m.compile(torch.randn(4, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" Mul:0 ", " mode(Pipelined), ipu(0), stage(1)") log.assert_contains(" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(2)") log.assert_contains(" Mul:0/2 ", " mode(Pipelined), ipu(2), stage(3)") log.assert_contains(" Mul:0/3 ", " mode(Pipelined), ipu(1), stage(4)") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_api_AutoIncrement(capfd): class Block(torch.nn.Module): def forward(self, x): return x * 6 class Model(torch.nn.Module): def __init__(self): super().__init__() self.l1 = Block() self.l2 = Block() self.l3 = Block() self.l4 = Block() def forward(self, x): x = self.l1(x) x = self.l2(x) x = self.l3(x) x = self.l4(x) return x m = Model() m.l2 = poptorch.BeginBlock(m.l2, ipu_id=1) m.l3 = 
poptorch.BeginBlock(m.l3, ipu_id=2) m.l4 = poptorch.BeginBlock(m.l4, ipu_id=1) opts = poptorch.Options() opts.deviceIterations(4).autoRoundNumIPUs(True) opts.setExecutionStrategy( poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)) m = poptorch.inferenceModel(m, opts) m(torch.randn(4, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" l1/Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" l2/Mul:0 ", " mode(Pipelined), ipu(1), stage(1)") log.assert_contains(" l3/Mul:0 ", " mode(Pipelined), ipu(2), stage(2)") log.assert_contains(" l4/Mul:0 ", " mode(Pipelined), ipu(1), stage(3)") @pytest.mark.ipuHardwareRequired def test_ipu_round_up_error(): class Block(torch.nn.Module): def forward(self, x): return x * 6 class Model(torch.nn.Module): def __init__(self): super().__init__() self.l1 = Block() self.l2 = Block() self.l3 = Block() def forward(self, x): x = self.l1(x) x = self.l2(x) x = self.l3(x) return x m = Model() poptorch.BeginBlock(m.l1, ipu_id=0) poptorch.BeginBlock(m.l2, ipu_id=1) poptorch.BeginBlock(m.l3, ipu_id=2) opts = poptorch.Options() opts.setExecutionStrategy( poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement)) m = poptorch.inferenceModel(m, opts) error_msg = ( ".+The model specifies the use of 3 IPUs, however PopTorch must " "reserve a minimum of 4 in order to allow the model to run, " "because PopTorch must reserve a power of 2 or maximum of 64 IPUs per " r"process\. 
Please reconfigure your model to use a different number of " r"IPUs or set poptorch\.Options\(\)\.autoRoundNumIPUs\(True\)\.") with pytest.raises(poptorch.Error, match=error_msg): m(torch.randn(4, 5)) class BlockFnModel(torch.nn.Module): def forward(self, x): poptorch.Block.useAutoId() x = self.mult_4(x) x = self.mult_2(x) return x @poptorch.BlockFunction(ipu_id=0) def mult_4(self, x): return x * 4 @poptorch.BlockFunction(ipu_id=1) def mult_2(self, x): return x * 2 @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_block_function(capfd): m = BlockFnModel() opts = poptorch.Options() opts.deviceIterations(2) m = poptorch.inferenceModel(m, opts) m(torch.randn(2, 5)) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" Mul:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" Mul:0/1 ", " mode(Pipelined), ipu(1), stage(1)") def test_block_function_saving(): m = BlockFnModel() m = poptorch.inferenceModel(m) with tempfile.TemporaryFile() as f: torch.save(m, f) def test_begin_block_functionality(): class Block(torch.nn.Module): def __init__(self): super().__init__() self.relu = torch.nn.ReLU() self.l1 = torch.nn.Linear(3, 5) self.l2 = torch.nn.Linear(5, 5) self.l3 = torch.nn.Linear(5, 3) def forward(self, x): x = self.relu(self.l1(x)) x = self.relu(self.l2(x)) x = self.relu(self.l3(x)) return x class Model(torch.nn.Module): def __init__(self): super().__init__() self.l1 = Block() self.l2 = Block() def forward(self, x): x = self.l1(x) with poptorch.Block(ipu_id=2): x = self.l2(x) return x m = Model() old_all_names = [n for n, _ in m.named_parameters()] old_state_dict = m.state_dict() m_l1_wrapped = poptorch.BeginBlock(m.l1, ipu_id=1) # The return is for backward compatibility assert m_l1_wrapped is m.l1 assert m.l2.__class__ is Block poptorch.BeginBlock(m.l2, ipu_id=2) new_all_names = [n for n, _ in m.named_parameters()] new_state_dict = m.state_dict() assert old_all_names == 
new_all_names sorted_state_dict_keys = sorted(old_state_dict.keys()) assert sorted_state_dict_keys == sorted(new_state_dict.keys()) for k in sorted_state_dict_keys: helpers.assert_allequal(expected=old_state_dict[k], actual=new_state_dict[k]) # Strict=True is a sufficient test in itself m.load_state_dict(old_state_dict, strict=True) # Test dir does not raise an exception dir(m.l1) # Test registering a buffer m.l1.register_buffer("a_buff", torch.nn.parameter.Parameter(torch.zeros(2, 2))) buffer_names = [b[0] for b in m.named_buffers()] assert "l1.a_buff" in buffer_names # Test registering a param m.l1.register_parameter("a_param", torch.nn.parameter.Parameter(torch.zeros(2, 2))) param_names = [p[0] for p in m.named_parameters()] assert "l1.a_param" in param_names # Test the model can still be saved f = io.BytesIO() torch.save(m.state_dict(), f) def run_in_python_and_get_block_details(model_file_path): python_script = (b"import poptorch\n" b"import torch\n" b"with open(\"" + model_file_path.encode('utf-8') + b"\", \"rb\") as f:\n" b" m = torch.load(f)\n" b"print(poptorch.module_repr(m))\n") s = subprocess.Popen(["python3"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) return s.communicate(python_script, timeout=10)[0].decode("utf-8") def test_saving_of_begin_block(): m = torch.nn.Sequential(torch.nn.Conv2d(3, 10, 5), torch.nn.ReLU(), torch.nn.Conv2d(10, 10, 5), torch.nn.ReLU()) with tempfile.NamedTemporaryFile() as f: torch.save(m, f) out = run_in_python_and_get_block_details(f.name) assert 'Sequential(' in out poptorch.BeginBlock(m, user_id=1, ipu_id=2) model_class_before_save = m.__class__ after_block_save = io.BytesIO() torch.save(m, after_block_save) assert m.__class__ == model_class_before_save with tempfile.NamedTemporaryFile() as f: torch.save(m, f) out = run_in_python_and_get_block_details(f.name) assert 'BeginBlock(user_id=1, ipu_id=2)' in out def test_begin_block_copy(): b_1 = torch.nn.Sequential(torch.nn.Conv2d(4, 8, 3), torch.nn.ReLU(), 
torch.nn.Conv2d(8, 10, 3), torch.nn.ReLU()) b_2 = torch.nn.Sequential(torch.nn.Conv2d(10, 5, 5), torch.nn.ReLU(), torch.nn.Conv2d(5, 10, 5), torch.nn.ReLU()) poptorch.BeginBlock(b_1, user_id=1, ipu_id=1) poptorch.BeginBlock(b_2, user_id=2, ipu_id=2) m = torch.nn.Sequential(b_1, b_2) assert "BeginBlock(user_id=1, ipu_id=1)" in poptorch.module_repr(m[0]) assert "BeginBlock(user_id=2, ipu_id=2)" in poptorch.module_repr(m[1]) m_copy = copy.copy(m) assert "BeginBlock(user_id=1, ipu_id=1)" in poptorch.module_repr(m_copy[0]) assert "BeginBlock(user_id=2, ipu_id=2)" in poptorch.module_repr(m_copy[1]) m_deep_copy = copy.deepcopy(m) assert "BeginBlock(user_id=1, ipu_id=1)" in poptorch.module_repr( m_deep_copy[0]) assert "BeginBlock(user_id=2, ipu_id=2)" in poptorch.module_repr( m_deep_copy[1]) def model_fn(inputs): return inputs + 1.0 def test_begin_block_with_function(): # Legacy use block = poptorch.BeginBlock(model_fn, 1, 2) # pylint: disable=protected-access assert block._user_id == 1 assert block._ipu_id == 2 with tempfile.TemporaryFile() as f: torch.save(block, f) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_removeBlocks(capfd): class Block(torch.nn.Module): def __init__(self): super().__init__() self.l1 = torch.nn.ReLU() self.l2 = torch.nn.ReLU() self.l3 = torch.nn.ReLU() def forward(self, x): x = self.l1(x) x = self.l2(x) x = self.l3(x) return x class Model(torch.nn.Module): def __init__(self): super().__init__() self.b1 = Block() self.b2 = Block() def forward(self, x): x = self.b1(x) x = self.b2(x) return x def compile_model(m): opts = poptorch.Options() opts.deviceIterations(10) if poptorch.ipuHardwareIsAvailable(): opts.useOfflineIpuTarget() poptorch_model = poptorch.inferenceModel(m, opts) poptorch_model.compile(torch.randn(10, 3)) def assert_is_not_pipelined(m): compile_model(m) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 0") log.assert_contains(" b1/l1/Relu:0 ", " mode(Pipelined), ipu(0), 
stage(0)") log.assert_contains(" b1/l2/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" b1/l3/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" b2/l1/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" b2/l2/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" b2/l3/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") def assert_is_pipelined(m): compile_model(m) log = helpers.LogChecker(capfd) log.assert_contains("enablePipelining set to value 1") log.assert_contains(" b1/l1/Relu:0 ", " mode(Pipelined), ipu(0), stage(0)") log.assert_contains(" b1/l2/Relu:0 ", " mode(Pipelined), ipu(1), stage(1)") log.assert_contains(" b1/l3/Relu:0 ", " mode(Pipelined), ipu(1), stage(1)") log.assert_contains(" b2/l1/Relu:0 ", " mode(Pipelined), ipu(2), stage(2)") log.assert_contains(" b2/l2/Relu:0 ", " mode(Pipelined), ipu(2), stage(2)") log.assert_contains(" b2/l3/Relu:0 ", " mode(Pipelined), ipu(2), stage(2)") m = Model() poptorch.BeginBlock(m.b1.l2, ipu_id=1) poptorch.BeginBlock(m.b2, ipu_id=2) assert_is_pipelined(m) with pytest.raises(poptorch.Error, match="module has already been assigned to a block"): poptorch.BeginBlock(m.b1.l2, ipu_id=1) poptorch.removeBlocks(m) assert_is_not_pipelined(m) poptorch.BeginBlock(m.b1.l2, ipu_id=1) poptorch.BeginBlock(m.b2, ipu_id=2) assert_is_pipelined(m) ================================================ FILE: tests/pooling_and_padding_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import pytest import torch import helpers import poptorch # Pools pool_operators = [ torch.nn.MaxPool1d, torch.nn.MaxPool2d, torch.nn.MaxPool3d, torch.nn.MaxUnpool1d, torch.nn.MaxUnpool2d, torch.nn.MaxUnpool3d, torch.nn.AvgPool1d, torch.nn.AvgPool2d, torch.nn.AvgPool3d, torch.nn.FractionalMaxPool2d, torch.nn.LPPool1d, torch.nn.LPPool2d, torch.nn.AdaptiveMaxPool1d, torch.nn.AdaptiveMaxPool2d, torch.nn.AdaptiveMaxPool3d, torch.nn.AdaptiveAvgPool1d, torch.nn.AdaptiveAvgPool2d, torch.nn.AdaptiveAvgPool3d ] # Supported. pool_1D = [torch.nn.MaxPool1d, torch.nn.AvgPool1d] pool_2D = [torch.nn.MaxPool2d, torch.nn.AvgPool2d] pool_3D = [torch.nn.MaxPool3d, torch.nn.AvgPool3d] adaptive_avg_pool = [ (torch.nn.AdaptiveAvgPool1d, 1), # Op, N output dims (torch.nn.AdaptiveAvgPool2d, 2), (torch.nn.AdaptiveAvgPool3d, 3), ] # torch.nn.AdaptiveMaxPool2d] # Adaptive max pooling isn't supported due to returning 2 outputs, easy fix. # TODO (T22978) # TODO(T25617): PopART does not support PadGradOp when mode is not "constant". # Ops without grad implementations in PopART ops_grad_unsupported = ( torch.nn.ReflectionPad1d, torch.nn.ReflectionPad2d, torch.nn.ReplicationPad1d, torch.nn.ReplicationPad2d, torch.nn.ReplicationPad3d, ) def execute_and_check_wrapper(op, input, check_shape_only=False): model = helpers.ModelWithWeights(op, input.shape) # Run on CPU. native_out, _ = model((input, )) test_training = not isinstance(op, ops_grad_unsupported) # Run on IPU. poptorch_model = poptorch.trainingModel( model) if test_training else poptorch.inferenceModel(model) poptorch_out, _ = poptorch_model((input, )) if not check_shape_only: # Inference test - check outputs helpers.assert_allclose(actual=poptorch_out, expected=native_out) else: # This is due to adaptive pooling's process essentially being an implementation detail. 
assert poptorch_out.size() == native_out.size() if test_training: # Training test - check weights have changed poptorch_model.assert_weights_changed() @pytest.mark.parametrize("op", pool_2D) def test_pool2D(op): torch.manual_seed(42) input = torch.randn(1, 2, 10, 10) # pool of square window of size=3, stride=2 model = op(3, stride=2) execute_and_check_wrapper(model, input) # pool of square window of size=3, stride=2, ceil_mode=True model = op(3, stride=2, ceil_mode=True) execute_and_check_wrapper(model, input) # pool of non-square window model = op((3, 2), stride=(2, 1)) execute_and_check_wrapper(model, input) # pool of square window of size=3, stride=2, padding=1 model = op(3, stride=2, padding=1) execute_and_check_wrapper(model, input) if op == torch.nn.AvgPool2d: # pool of square window of size=3, stride=2, padding=1, pool excludes padding model = op(3, stride=2, padding=1, count_include_pad=False) execute_and_check_wrapper(model, input) @pytest.mark.parametrize("op, n_output_dims", adaptive_avg_pool) def test_adaptive_avg_pool(op, n_output_dims): torch.manual_seed(42) # AdaptiveAvgPool1d: [1, 2, 4] -> [1, 2, 2] # AdaptiveAvgPool2d: [1, 2, 4, 6] -> [1, 2, 2, 3] # AdaptiveAvgPool3d: [1, 2, 4, 6, 8] -> [1, 2, 2, 3, 4] # TODO(T31335): Match PyTorch's implementation so that we can test cases where # input dims are not divisible by corresponding output dims shape = [1, 2] shape.extend([2 * i + 4 for i in range(n_output_dims)]) input = torch.randn(shape) output_size = [i + 2 for i in range(n_output_dims)] model = op(output_size) execute_and_check_wrapper(model, input) # Padding one_d_pads = [ torch.nn.ReflectionPad1d, torch.nn.ReplicationPad1d, torch.nn.ConstantPad1d ] @pytest.mark.parametrize("op", one_d_pads) def test_1D_pads(op): torch.manual_seed(42) # torch.nn.ConstantPad1d, 'torch.nn.ConstantPad2d', 'torch.nn.ConstantPad3d', # One D case oneDTensor = torch.randn(1, 2, 4) # Pad evenly in both directions. 
if op == torch.nn.ConstantPad1d: model = op(2, 4.7) else: model = op(3) execute_and_check_wrapper(model, oneDTensor) # Pad unevenly in both directions. if op == torch.nn.ConstantPad1d: model = op((3, 2), 0.12456) else: model = op((3, 2)) execute_and_check_wrapper(model, oneDTensor) two_d_pads = [ torch.nn.ReflectionPad2d, torch.nn.ReplicationPad2d, torch.nn.ConstantPad2d, torch.nn.ZeroPad2d ] @pytest.mark.parametrize("op", two_d_pads) def test_2D_pads(op): # 2D Case twoDTensor = torch.randn(1, 2, 4, 4) # Pad evenly in all directions. if op == torch.nn.ConstantPad2d: model = op(6, 2.3) else: model = op(2) execute_and_check_wrapper(model, twoDTensor) # Pad unevenly in all directions. if op == torch.nn.ConstantPad2d: model = op((3, 2, 1, 5), 4.7) else: model = op((3, 2, 1, 3)) execute_and_check_wrapper(model, twoDTensor) three_d_pads = [torch.nn.ReplicationPad3d, torch.nn.ConstantPad3d] @pytest.mark.parametrize("op", three_d_pads) def test_3D_pads(op): # 3D Case threeDTensor = torch.randn(1, 2, 4, 4, 4) # Pad evenly in all directions. if op == torch.nn.ConstantPad3d: model = op(2, 6.4) else: model = op(3) execute_and_check_wrapper(model, threeDTensor) # Pad unevenly in all directions. 
if op == torch.nn.ConstantPad3d: model = op((3, 2, 1, 5, 3, 4), 7.2) else: model = op((3, 2, 1, 5, 3, 4)) execute_and_check_wrapper(model, threeDTensor) def test_constant_pad_less_dims(): torch.manual_seed(42) class Model(torch.nn.Module): def forward(self, x): # Only pad the last dimension of input return torch.nn.functional.pad(x, [1, 2]) x = torch.randn(1, 2, 3, 4) execute_and_check_wrapper(Model(), x) def test_constant_pad_n_dims(): torch.manual_seed(42) class Model(torch.nn.Module): def forward(self, x): # Pad left/right dims by 1 and 2 respectively, for every dim return torch.nn.functional.pad(x, [(i % 2) + 1 for i in range(8)]) x = torch.randn(1, 2, 3, 4) execute_and_check_wrapper(Model(), x) ================================================ FILE: tests/popdist_test.py ================================================ # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import pytest import poptorch # pylint: disable=import-outside-toplevel def test_blocked_options(): try: import popdist.poptorch except ModuleNotFoundError: pytest.skip( "Unable to import popdist: possibly a Python version mismatch?") opts = popdist.poptorch.Options(ipus_per_replica=2) with pytest.raises( RuntimeError, match=r"Cannot call `useIpuId` with popdist\.poptorch\.Options"): opts.useIpuId(1) with pytest.raises(RuntimeError, match=r"Cannot call `replicationFactor` with " r"popdist\.poptorch\.Options"): opts.replicationFactor(1) with pytest.raises(RuntimeError, match=r"Cannot call `Distributed.disable` with " r"popdist\.poptorch\.Options"): opts.Distributed.disable() with pytest.raises(RuntimeError, match=r"Cannot call `Distributed.setEnvVarNames` with " r"popdist\.poptorch\.Options"): opts.Distributed.setEnvVarNames("A", "B") with pytest.raises( RuntimeError, match=r"Cannot call `Distributed.configureProcessId` with " r"popdist\.poptorch\.Options"): opts.Distributed.configureProcessId(1) # pylint: disable=import-outside-toplevel def test_getters(): try: import popdist.poptorch 
except ModuleNotFoundError: pytest.skip( "Unable to import popdist: possibly a Python version mismatch?") opts = popdist.poptorch.Options(ipus_per_replica=2) assert opts.Distributed.processId == 0 assert opts.Distributed.numProcesses == 1 # pylint: disable=protected-access,import-outside-toplevel @pytest.mark.ipuHardwareRequired def test_to_dict(): try: import popdist.poptorch except ModuleNotFoundError: pytest.skip( "Unable to import popdist: possibly a Python version mismatch?") opts = popdist.poptorch.Options(ipus_per_replica=2) opts.outputMode(poptorch.enums.OutputMode.All) opts.toDict() # Should not be frozen here opts.checkIsFrozen() opts._freeze() # Should unfeeze and freeze again opts.toDict() with pytest.raises(AttributeError, match=r"Can't modify frozen Options"): opts.checkIsFrozen() ================================================ FILE: tests/poplar_executor_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import datetime import unittest.mock import os import re import tempfile import glob import warnings import pytest import torch import torch.multiprocessing as mp import helpers import poptorch @pytest.mark.ipuHardwareRequired @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_ExecutableCaching(capfd): class Model(torch.nn.Module): def forward(self, x): return x * 6 with tempfile.TemporaryDirectory() as cache: opts = poptorch.Options() opts.enableExecutableCaching(cache) m = poptorch.inferenceModel(Model(), opts) m.compile(torch.rand(2, 3)) m.destroy() log = helpers.LogChecker(capfd) log.assert_contains("set enableEngineCaching to value true") assert len(os.listdir(cache)) == 1, "No executable saved in the cache" n = poptorch.inferenceModel(Model(), opts) n.compile(torch.rand(2, 3)) log = helpers.LogChecker(capfd) log.assert_contains("set enableEngineCaching to value true") @pytest.mark.ipuHardwareRequired @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") def test_ExecutableCaching_env(capfd): class Model(torch.nn.Module): def forward(self, x): return x * 6 with tempfile.TemporaryDirectory() as cache: os.environ["POPTORCH_CACHE_DIR"] = cache opts = poptorch.Options() m = poptorch.inferenceModel(Model(), opts) m.compile(torch.rand(2, 3)) m.destroy() log = helpers.LogChecker(capfd) log.assert_contains("set enableEngineCaching to value true") assert len(os.listdir(cache)) == 1, "No executable saved in the cache" n = poptorch.inferenceModel(Model(), opts) n.compile(torch.rand(2, 3)) log = helpers.LogChecker(capfd) log.assert_contains("set enableEngineCaching to value true") class Network(torch.nn.Module): def forward(self, x, y): return x + y def _create_model_and_export(opts, filename): model = Network() inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) inference_model.compileAndExport(filename, x, y) assert os.path.exists(filename) @unittest.mock.patch.dict("os.environ", 
helpers.disableAllModels()) def test_offline_ipu_compileAndExport_file(filename=None): # Force-disable the IPU model opts = poptorch.Options().useOfflineIpuTarget() with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") _create_model_and_export(opts, filename) @pytest.mark.ipuHardwareRequired def test_precompile_then_load(): opts = poptorch.Options().useOfflineIpuTarget( poptorch.ipuHardwareVersion()) with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") _create_model_and_export(opts, filename) poptorch_model = poptorch.load(filename) x = torch.tensor([1., 2.]) y = torch.tensor([3., 4.]) # Check the user model was restored helpers.assert_allclose(actual=poptorch_model.model(x, y), expected=torch.tensor([4., 6.])) helpers.assert_allclose(actual=poptorch_model(x, y), expected=torch.tensor([4., 6.])) @unittest.mock.patch.dict("os.environ", helpers.disableAllModels()) def test_offline_ipu_compileAndExport_dir(): class Network(torch.nn.Module): def forward(self, x, y): return x + y model = Network() # Force-disable the IPU model opts = poptorch.Options().useOfflineIpuTarget() poptorch.inferenceModel(model, opts) inference_model = poptorch.inferenceModel(model, opts) x = torch.ones(2) y = torch.zeros(2) with tempfile.TemporaryDirectory() as tmp: assert os.path.isdir(tmp) # Model is local to the function: it cannot be serialised so don't # export it. 
inference_model.compileAndExport(tmp, x, y, export_model=False) files = glob.glob(f"{tmp}/*") assert len(files) == 1, "Expected exactly 1 file" def test_inference_attributes(): class Model(torch.nn.Module): def __init__(self, attr): super().__init__() self.attr = attr def getAttr(self): return self.attr def forward(self, x, y): return x + y + 5 poptorch_model = poptorch.inferenceModel(Model("MyAttr")) t1 = torch.tensor([1.]) t2 = torch.tensor([2.]) poptorch_model(t1, t2) assert poptorch_model.getAttr() == poptorch_model.attr assert poptorch_model.attr == "MyAttr" def test_training_attributes(): def custom_loss(output, target): # Mean squared error with a scale loss = output - target loss = loss * loss * 5 return poptorch.identity_loss(loss, reduction="mean") class Model(torch.nn.Module): def __init__(self, attr): super().__init__() self.bias = torch.nn.Parameter(torch.zeros(())) self.attr = attr def getAttr(self): return self.attr def forward(self, x, target): x = x + 1 x = poptorch.ipu_print_tensor(x) + self.bias return x, custom_loss(x, target) model = Model("MyAttr") input = torch.tensor([1.0, 2.0, 3.0]) target = torch.tensor([30.0, 40.0, 50.0]) poptorch_model = poptorch.trainingModel(model) poptorch_model(input, target) assert poptorch_model.getAttr() == poptorch_model.attr assert poptorch_model.attr == "MyAttr" @pytest.mark.ipuHardwareRequired @pytest.mark.parametrize("use_half", [False]) def test_explicit_destroy(use_half): class ExampleModel(torch.nn.Module): def __init__(self): super().__init__() self.bias = torch.nn.Parameter(torch.zeros(())) def forward(self, x): x = x + 1 # It is important to make sure the result of the print is used. 
x = poptorch.ipu_print_tensor(x) return x + self.bias def custom_loss(output, target): # Mean squared error with a scale loss = output - target loss = loss * loss * 5 return poptorch.identity_loss(loss, reduction="mean") class ExampleModelWithCustomLoss(torch.nn.Module): def __init__(self): super().__init__() self.model = ExampleModel() def forward(self, input, target=None): out = self.model(input) if target is not None: return out, custom_loss(out, target) return out opts = poptorch.Options() # Both models will use the same IPU device. opts.useIpuId(1) model = ExampleModelWithCustomLoss() input = torch.tensor([1.0, 2.0, 3.0]) target = torch.tensor([30.0, 40.0, 50.0]) if use_half: model.half() input = input.half() target = target.half() training_model = poptorch.trainingModel(model, opts) inference_model = poptorch.inferenceModel(model, opts) training_model(input=input, target=target) training_model.destroy() error_msg = r"Model has not been compiled or has been destroyed." with pytest.raises(poptorch.Error, match=error_msg): training_model.copyWeightsToHost() with pytest.raises(poptorch.Error, match=error_msg): training_model.copyWeightsToDevice() inference_model(input) def _compile_model_offline(cache, pid, num_processes): poptorch.setLogLevel("DEBUG") # Force debug logging in worker process opts = poptorch.Options().useOfflineIpuTarget() opts.enableExecutableCaching(cache) # Disable compilation bar to avoid issues with capfd opts.showCompilationProgressBar(False) opts.deviceIterations(10) opts.Distributed.configureProcessId(pid, num_processes) class ModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, data, target): out = self.linear(data) loss = self.loss(out, target) return out, loss model = ModelWithLoss() poptorch_model = poptorch.trainingModel(model, options=opts) # 10 Batches of 10. 
input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) poptorch_model.compile(input, label) # Force-disable the IPU model @unittest.mock.patch.dict("os.environ", helpers.disableAllModels()) @helpers.printCapfdOnExit def test_distributed_compile(capfd): num_processes = 6 with tempfile.TemporaryDirectory() as tmp: cache = os.path.join(tmp, "poptorch_cache") ctx = mp.get_context('spawn') processes = [ ctx.Process(target=_compile_model_offline, args=(cache, pid, num_processes)) for pid in range(num_processes) ] for p in processes: p.start() for p in processes: p.join() def getTimestamp(line): m = re.match(r"\[([\d:.]+)\]", line) return datetime.datetime.strptime(m.group(1), "%H:%M:%S.%f") log = helpers.LogChecker(capfd).createIterator() includes_compilation = True for p in processes: start = getTimestamp(log.findNext("cache file locked")) end = getTimestamp(log.findNext("released the cache lock")) if includes_compilation: assert end - start > datetime.timedelta(seconds=1), ( "Expected the" " first process model compilation to take more than 1 " f"second but it took {end - start}") else: assert end - start < datetime.timedelta(seconds=1), ( "Expected " "processes to load the executable from the cache in under" f" 1 second but it took {end - start}") includes_compilation = False def test_cpu_output(): const1 = torch.tensor([1, 2]) const2 = torch.tensor([3, 4]) class Model(torch.nn.Module): def forward(self): return (const1 + const2, ([const1, const2], [const1, const2]), const2) model = Model() with warnings.catch_warnings(record=True) as filtered_warnings: poptorch.inferenceModel(model).compile() pop_warns = set(str(w.message) for w in filtered_warnings) expected_warning = "Output expected to be on the IPU but is on cpu" for r in pop_warns: assert expected_warning in r, (f"Compilation generated unexpected " f"warning.\nActual warning: {r}") @pytest.mark.ipuHardwareRequired def test_get_cycles_error_msgs(): class 
Model(torch.nn.Module): def forward(self, x, y): return x + y inference_model = poptorch.inferenceModel(Model()) error_msg = (r"Cycle count logging is disabled. Please set option " r"logCycleCount to True to enable.") with pytest.raises(poptorch.Error, match=error_msg): inference_model.cycleCount() opts = poptorch.Options() opts.logCycleCount(True) inference_model = poptorch.inferenceModel(Model(), options=opts) error_msg = (r"Please run the model at least once before obtaining cycle " r"count.") with pytest.raises(poptorch.Error, match=error_msg): inference_model.cycleCount() inference_model.compile(torch.Tensor([1.0]), torch.Tensor([2.0])) error_msg = (r"Please run the model at least once before obtaining cycle " r"count.") with pytest.raises(poptorch.Error, match=error_msg): inference_model.cycleCount() inference_model(torch.Tensor([3.0]), torch.Tensor([4.0])) assert inference_model.cycleCount() > 0 @pytest.mark.skipif(poptorch.ipuHardwareIsAvailable(), reason="Test error message when no hardware") def test_get_cycles_no_hw(): class Model(torch.nn.Module): def forward(self, x, y): return x + y inference_model = poptorch.inferenceModel(Model()) opts = poptorch.Options() opts.logCycleCount(True) inference_model = poptorch.inferenceModel(Model(), options=opts) error_msg = ( r"Cycle count logging is only supported on actual IPU hardware.") with pytest.raises(poptorch.Error, match=error_msg): inference_model(torch.Tensor([3.0]), torch.Tensor([4.0])) def test_get_compilation_time(): class Model(torch.nn.Module): def forward(self, x, y): return x + y no_compilation_time_opts = poptorch.Options() no_compilation_time_opts.showCompilationProgressBar(False) no_compilation_time_model = poptorch.inferenceModel( Model(), options=no_compilation_time_opts) compilation_time_opts = poptorch.Options() compilation_time_opts.showCompilationProgressBar(True) compilation_time_model = poptorch.inferenceModel( Model(), options=compilation_time_opts) error_msg = ( r"Please compile the 
model before obtaining compilation time.") with pytest.raises(poptorch.Error, match=error_msg): no_compilation_time_model.compilationTime() with pytest.raises(poptorch.Error, match=error_msg): compilation_time_model.compilationTime() error_msg = ( r"Please set showCompilationProgressBar option to obtain compilation " r"time.") with pytest.raises(poptorch.Error, match=error_msg): no_compilation_time_model(torch.Tensor([3.0]), torch.Tensor([4.0])) no_compilation_time_model.compilationTime() compilation_time_model(torch.Tensor([3.0]), torch.Tensor([4.0])) compilation_time = compilation_time_model.compilationTime() assert compilation_time > datetime.timedelta(seconds=1) @pytest.mark.parametrize("rewrap_executor", [True, False]) def test_rewrap_model(rewrap_executor): class Model(torch.nn.Module): def __init__(self): super().__init__() self.fc = torch.nn.Linear(1, 1) self.loss = torch.nn.L1Loss() def forward(self, x): y = self.fc(x) loss = self.loss(y, x + 1) return loss model = Model() # Normal running torch.nn.init.ones_(model.fc.weight) torch.nn.init.zeros_(model.fc.bias) opts = poptorch.Options() opts.deviceIterations(10) poptorch_model = poptorch.trainingModel(model, options=opts) poptorch_model(torch.ones([10])) bias_after_1000 = float(model.fc.bias) # Try rewrapping model half way torch.nn.init.ones_(model.fc.weight) torch.nn.init.zeros_(model.fc.bias) with pytest.raises(AssertionError): helpers.assert_allclose(actual=model.fc.bias, expected=bias_after_1000) model.destroy() opts = poptorch.Options() opts.deviceIterations(5) poptorch_model = poptorch.trainingModel(model, options=opts) poptorch_model(torch.ones([5])) err_msg = (r"Model has already been wrapped in 'poptorch.trainingModel'." 
r" Call model.destroy\(\) on the model to unwrap before " "wrapping again.") with pytest.raises(RuntimeError, match=err_msg): poptorch_model = poptorch.trainingModel(model, options=opts) # re-wrap for test if rewrap_executor: poptorch_model.destroy() poptorch_model = poptorch.trainingModel(poptorch_model, options=opts) else: model.destroy() poptorch_model = poptorch.trainingModel(model, options=opts) poptorch_model(torch.ones([5])) helpers.assert_allclose(actual=float(model.fc.bias), expected=bias_after_1000) ================================================ FILE: tests/precompilation_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2021 Graphcore Ltd. All rights reserved. import os import re import marshal import subprocess import sys import json import pathlib import tempfile import unittest.mock import pytest import torch import helpers import poptorch class ExampleModelWithLoss(torch.nn.Module): def __init__(self): super().__init__() self.fc = torch.nn.Linear(10, 10) self.loss = torch.nn.CrossEntropyLoss() def forward(self, x, target=None): fc = self.fc(x) if self.training: return fc, self.loss(fc, target) return fc def _createExampleModel(training, offline_target=False): torch.manual_seed(42) model = ExampleModelWithLoss() opts = poptorch.Options() if offline_target: opts.useOfflineIpuTarget(poptorch.ipuHardwareVersion()) if training: model.train() poptorch_model = poptorch.trainingModel(model, opts) else: model.eval() poptorch_model = poptorch.inferenceModel(model, opts) return poptorch_model def _compileAndExport(filename, export_model=True, training=True): poptorch_model = _createExampleModel(training, True) input = torch.randn(1, 10) target = torch.randint(0, 10, [1]) if training: poptorch_model.compileAndExport(filename, input, target, export_model=export_model) else: poptorch_model.compileAndExport(filename, input, export_model=export_model) poptorch_model.destroy() return input, target 
@pytest.mark.ipuHardwareRequired def test_export_then_load_live_model(): with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") poptorch_model = _createExampleModel(training=False) input = torch.randn(1, 10) # Running the model will trigger the executable compilation poptorch_model(input) # Save the executable and destroy the model poptorch_model.save(filename) poptorch_model.destroy() # Reload the model from file and run it. poptorch_model = poptorch.load(filename) poptorch_model(input) @pytest.mark.ipuHardwareRequired def test_export_then_load(): with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") input, target = _compileAndExport(filename) poptorch_model = poptorch.load(filename) poptorch_model(input, target) @pytest.mark.ipuHardwareRequired def test_export_then_load_setIpu(): with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") input, target = _compileAndExport(filename) def setIpuDevice(opts): opts.useIpuId(1) # always use IPU 1 poptorch_model = poptorch.load(filename, edit_opts_fn=setIpuDevice) poptorch_model(input, target) @pytest.mark.ipuHardwareRequired def test_export_no_python_then_load(): with tempfile.TemporaryDirectory() as tmp: filename = os.path.join(tmp, "model.poptorch") input, target = _compileAndExport(filename, export_model=False) # load_exe_start model = ExampleModelWithLoss() opts = poptorch.Options() poptorch_model = poptorch.trainingModel(model, opts) poptorch_model.loadExecutable(filename) poptorch_model(input, target) @pytest.mark.ipuHardwareRequired def test_export_train_validate_no_python(): with tempfile.TemporaryDirectory() as tmp: train_filename = os.path.join(tmp, "train.poptorch") valid_filename = os.path.join(tmp, "valid.poptorch") input, target = _compileAndExport(train_filename, export_model=False) _compileAndExport(valid_filename, export_model=False, training=False) model = ExampleModelWithLoss() options = 
poptorch.Options() training_model = poptorch.trainingModel(model, options=options) training_model.loadExecutable(train_filename) model.eval() validation_model = poptorch.inferenceModel(model, options) validation_model.loadExecutable(valid_filename) # Make sure the first run doesn't already pass the test. out, original_loss = training_model(input, target) assert torch.argmax(out, dim=1) != target out = validation_model(input) assert torch.argmax(out, dim=1) != target for _ in range(500): out, loss = training_model(input, target) # Check we have trained the model assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target # Check validation model has the weights out = validation_model(input) assert torch.argmax(out, dim=1) == target @pytest.mark.ipuHardwareRequired def test_export_train_validate(): with tempfile.TemporaryDirectory() as tmp: train_filename = os.path.join(tmp, "train.poptorch") valid_filename = os.path.join(tmp, "valid.poptorch") input, target = _compileAndExport(train_filename) _compileAndExport(valid_filename, training=False) training_model = poptorch.load(train_filename) options = poptorch.Options() validation_model = poptorch.inferenceModel(training_model, options) validation_model.model.eval() validation_model.loadExecutable(valid_filename) # Make sure the first run doesn't already pass the test. 
out, original_loss = training_model(input, target) assert torch.argmax(out, dim=1) != target out = validation_model(input) assert torch.argmax(out, dim=1) != target for _ in range(500): out, loss = training_model(input, target) # Check we have trained the model assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target # Check validation model has the weights out = validation_model(input) assert torch.argmax(out, dim=1) == target @pytest.mark.ipuHardwareRequired def test_export_train_save_validate(): with tempfile.TemporaryDirectory() as tmp: train_filename = os.path.join(tmp, "train.poptorch") valid_filename = os.path.join(tmp, "valid.poptorch") input, target = _compileAndExport(train_filename) training_model = poptorch.load(train_filename) opts = poptorch.Options() opts.useOfflineIpuTarget(poptorch.ipuHardwareVersion()) validation_model = poptorch.inferenceModel(training_model, opts) validation_model.model.eval() # Make sure the first run doesn't already pass the test. out, original_loss = training_model(input, target) assert torch.argmax(out, dim=1) != target # Now train the model for _ in range(500): out, loss = training_model(input, target) # Check we have trained the model assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target validation_model.compileAndExport(valid_filename, input) validation_model = poptorch.load(valid_filename) # Check validation model has the weights out = validation_model(input) assert torch.argmax(out, dim=1) == target @pytest.mark.ipuHardwareRequired def test_export_train_save_train(): with tempfile.TemporaryDirectory() as tmp: train_filename = os.path.join(tmp, "train.poptorch") weights_filename = os.path.join(tmp, "weights.poptorch") input, target = _compileAndExport(train_filename) training_model = poptorch.load(train_filename) # Make sure the first run doesn't already pass the test. 
out, original_loss = training_model(input, target) assert torch.argmax(out, dim=1) != target # Now train the model for _ in range(500): out, loss = training_model(input, target) # Check we have trained the model assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target torch.save(training_model.model.state_dict(), weights_filename) training_model.destroy() training_model = poptorch.load(train_filename) training_model.load_state_dict(torch.load(weights_filename)) # Check we still have the trained weights out, loss = training_model(input, target) assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target @pytest.mark.ipuHardwareRequired def test_export_train_save_validate_load_weights(): with tempfile.TemporaryDirectory() as tmp: train_filename = os.path.join(tmp, "train.poptorch") valid_filename = os.path.join(tmp, "valid.poptorch") weights_filename = os.path.join(tmp, "weights.poptorch") _compileAndExport(valid_filename, training=False) input, target = _compileAndExport(train_filename) training_model = poptorch.load(train_filename) # Make sure the first run doesn't already pass the test. out, original_loss = training_model(input, target) assert torch.argmax(out, dim=1) != target # Now train the model for _ in range(500): out, loss = training_model(input, target) # Check we have trained the model assert loss < original_loss assert loss < 0.05 assert torch.argmax(out, dim=1) == target torch.save(training_model.model, weights_filename) training_model.destroy() validation_model = poptorch.load(valid_filename) validation_model.load_state_dict( torch.load(weights_filename).state_dict()) # Check validation model has the weights out = validation_model(input) assert torch.argmax(out, dim=1) == target def process_to_generate_profiling_data(): """A function executed as a script running in a separate process. We need to do this because profiling data is only written to disk when a process exits. 
""" # pylint: disable=import-outside-toplevel # pylint: disable=reimported import poptorch import torch class Block(torch.nn.Module): def __init__(self, num_hidden): super().__init__() self.softmax = torch.nn.LogSoftmax(1) self.lstm = torch.nn.LSTM(3, num_hidden) def forward(self, x): x, _ = self.lstm(x) x = self.softmax(x) return x class Model(torch.nn.Module): def __init__(self, num_hidden): super().__init__() self.relu = torch.nn.ReLU() self.block0 = Block(num_hidden) def forward(self, x): x = self.block0(x) x = self.relu(x) loss = poptorch.identity_loss(x**2, reduction='sum') return x, loss input = torch.randn(1, 1, 3) model = Model(3) optimizer = poptorch.optim.SGD(model.parameters(), lr=0.0) opts = poptorch.Options() opts.useOfflineIpuTarget() training_model = poptorch.trainingModel(model, opts, optimizer=optimizer) training_model.compile(input) @unittest.mock.patch.dict( "os.environ", { **helpers.disableAllModels(), "POPLAR_ENGINE_OPTIONS": json.dumps({ "autoReport.directory": ".", "autoReport.all": "true", "autoReport.outputDebugInfo": "true", "autoReport.outputExecutionProfile": "false" }) }) @pytest.mark.mlirSupportRequired # pylint: disable=import-outside-toplevel def test_pva_annotations(): try: import pva except RuntimeError: pytest.skip( "Unable to import pva: possibly a Python version mismatch?") def findPoptorchLayer(op): layer = json.loads(op.layer)["layer"] if layer == "poptorch": return op assert op.parents, "Can't find 'poptorch' layer" return findPoptorchLayer(op.parents[0]) with tempfile.TemporaryDirectory() as tmp: os.chdir(tmp) subprocess.check_output( [ sys.executable, "-u", # needed to ensure messages are sent to stdout immediately "-c", f""" import os, marshal, types;code = marshal.loads({marshal.dumps(process_to_generate_profiling_data.__code__)}) fn = types.FunctionType(code, globals(), "generate_profiling_data") fn() """ ], universal_newlines=True, env=os.environ) debug = pathlib.Path("debug.cbor").resolve(strict=True) profile = 
pathlib.Path("training", "profile.pop").resolve(strict=True) # Read this file and find where the layers were called from to make # sure the line numbers are correct inside the profiling information. it = helpers.LogIterator(open(__file__, "r").read().split("\n")) lines = [] it.findNext(re.escape("def process_to_generate_profiling_data():")) for e in [ "self.lstm(", "self.softmax(", "self.relu(", "identity_loss(" ]: it.findNext(re.escape(e)) lines.append(it.lineNumber()) report = pva.openReport(str(profile), str(debug)) op_analysis = pva.OperationAnalysis(report) for op in op_analysis.operations: if not op.name or op.name == "Call": continue if op.replacedDebugContext: ctx = op.replacedDebugContext[0] else: ctx = op.debugContext pop_op = findPoptorchLayer(ctx) data = json.loads(pop_op.json) op_file = pop_op.location.fileName op_line = pop_op.location.lineNumber print(f"Name {op.name} {op_file}:{op_line} Debug {data}") # All the ops should be associated to this file assert os.path.realpath(op_file) == os.path.realpath(__file__) assert op.name == data["op_name"] # The identity loss is not a layer in the model therefore it won't have a prefix. if data["op_type"] in ["Pow", "Identityloss"]: assert data["op_name"] == data["op_type"] else: # All the other ops are stored in the model therefore they'll have prefix # "foo/op_type" where "foo" is the name of the attribute in the model. 
assert data["op_name"].endswith("/" + data["op_type"]) assert data["layer"] == "poptorch" if data["op_name"].startswith("block0/lstm"): assert op_line == lines[0] elif data["op_name"].startswith("block0/softmax"): assert op_line == lines[1] elif data["op_name"].startswith("relu/"): assert op_line == lines[2] elif data["op_name"] == data["op_type"]: # identity_loss(x**2) assert op_line == lines[3] else: raise ValueError("Unexpected op " + data["op_name"]) ================================================ FILE: tests/pyg_torch_scatter_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2022 Graphcore Ltd. All rights reserved. # Tests for PyG torch_scatter ops integration with PopTorch from functools import partial import torch import pytest import helpers import poptorch if helpers.is_running_tests: from torch_scatter import scatter, scatter_log_softmax, scatter_softmax, scatter_std, scatter_logsumexp, scatter_add, scatter_max, scatter_min, scatter_mul else: def scatter(): pass def scatter_log_softmax(): pass def scatter_softmax(): pass def scatter_std(): pass def scatter_add(): pass def scatter_max(): pass def scatter_min(): pass def scatter_mul(): pass def scatter_logsumexp(): pass def torch_scatter_harness(func, src, index, out=None): dim_size = int(index.max()) + 1 class Model(torch.nn.Module): def forward(self, src, index, out=None): if out is None: return func(src, index, dim_size=dim_size) return func(src, index, out=out, dim_size=dim_size) model = Model() poptorch_model = poptorch.inferenceModel(model) out_in_plac_native = None if out is not None: out_in_plac_native = out.clone() native_out = func(src, index, out=out_in_plac_native, dim_size=dim_size) ipu_out = poptorch_model(src, index, out=out) else: native_out = func(src, index, dim_size=dim_size) ipu_out = poptorch_model(src, index) helpers.assert_allclose(actual=ipu_out, expected=native_out) if out is not None: helpers.assert_allclose(actual=out, 
expected=out_in_plac_native) poptorch_model.destroy() @pytest.mark.parametrize("reduce", ['sum', 'mean', 'max', 'min', 'mul']) def test_scatter(reduce): func = partial(scatter, reduce=reduce) src = torch.tensor([1, 3, 2, 4, 5, 6]).float() index = torch.tensor([0, 1, 0, 1, 1, 3]).long() torch_scatter_harness(func, src, index) @pytest.mark.parametrize( "func", [scatter_log_softmax, scatter_logsumexp, scatter_softmax, scatter_std]) def test_composites(func): src = torch.tensor([1, 3, 2, 4, 5, 6]).float() index = torch.tensor([0, 1, 0, 1, 5, 3]).long() torch_scatter_harness(func, src, index) @pytest.mark.parametrize("func", [scatter_max, scatter_min, scatter_mul]) def test_scatter_inplace(func): src = torch.tensor([1, 3, 2, 4, 5, 6]).float() index = torch.tensor([0, 1, 4, 2, 3, 5]).long() out = torch.tensor([10, 1, 11, 1, 23, 1]).float() torch_scatter_harness(func, src, index, out) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_scatter_add_zeros_optimized(capfd): src = torch.tensor([1, 3, 2, 4, 5, 6]).float() index = torch.tensor([0, 1, 0, 1, 1, 3]).long() torch_scatter_harness(scatter_add, src, index) it = helpers.LogChecker(capfd).createIterator() it.findNext("Removing zeros output to scatter_add") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("TRACE") def test_scatter_add_nd_expand_removed(capfd): torch.manual_seed(0) src = torch.randn(10, 6, 16) index = torch.tensor([0, 1, 0, 1, 1, 3]).long() func = partial(scatter_add, dim=1) torch_scatter_harness(func, src, index) it = helpers.LogChecker(capfd).createIterator() it.findNext("Removing index expansion node:") @pytest.mark.parametrize("shape", [(5, ), (2, 5), (2, 5, 5)]) @pytest.mark.parametrize("func", [scatter_max, scatter_min, scatter_mul]) def test_scatter_overloads(shape, func): torch.manual_seed(0) x = torch.rand(shape) ind = torch.randint(3, shape) torch_scatter_harness(func, x, ind) ================================================ FILE: 
tests/random_sampling_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import torch
import pytest
import helpers
import poptorch


def int_mean(x):
    """Return the mean of ``x`` after converting it to float."""
    return torch.mean(x.to(torch.float))


def int_var(x):
    """Return the variance of ``x`` after converting it to float."""
    return torch.var(x.to(torch.float))


# Random Number Generation Harness
# Checks that the IPU generated data with roughly the same summary
# statistics as the CPU version.
def rng_harness(rng_op, input, stat_funs, expected_dtype=torch.float):
    """Run ``rng_op`` on the IPU and natively and compare summary statistics.

    ``rng_op`` is called with a copy of ``input``. The IPU output must have
    dtype ``expected_dtype`` and the same size as the native output; each
    function in ``stat_funs`` is then applied to both outputs and the
    results are compared with ``atol=1e-2`` and ``rtol=0.1``.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.rng_op = rng_op

        def forward(self, x):
            torch.manual_seed(42)
            x = x + 0  # Ensure input is not modified in place
            return self.rng_op(x)

    model = Model()

    # Run on IPU and check that the result has the correct type
    opts = poptorch.Options().randomSeed(8)
    pop_model = poptorch.inferenceModel(model, opts)
    pop_out = pop_model(input)
    assert pop_out.dtype == expected_dtype

    if expected_dtype is torch.half:
        # Promote CPU model and input
        model = model.float()
        input = input.float()
        # promote IPU result to allow summary stat comparison
        pop_out = pop_out.float()

    native_out = model(input)
    assert native_out.size() == pop_out.size()

    # PRNG depends on HW implementation so we just check
    # that the distribution statistics are consistent
    print("Checking summary statistics for generated random numbers:")
    for ss in stat_funs:
        print(" {} = poptorch {}, native {}".format(ss.__name__, ss(pop_out),
                                                    ss(native_out)),
              flush=True)
        helpers.assert_allclose(expected=ss(native_out),
                                actual=ss(pop_out),
                                atol=1e-2,
                                rtol=0.1)


# torch.rand
@pytest.mark.ipuHardwareRequired
def test_rand():
    """Compare min/max/mean/var of ``torch.rand`` samples."""

    def rng_op(x):
        return torch.rand(x.size())

    stat_funs = [torch.min, torch.max, torch.mean, torch.var]

    input = torch.empty(size=(3, 5, 100))
    rng_harness(rng_op, input, stat_funs)


# torch.distributions.Uniform
@pytest.mark.ipuHardwareRequired
def test_distributions_uniform():
    """Compare statistics of samples drawn from ``Uniform(0.0, 10.0)``."""

    def rng_op(x):
        ud = torch.distributions.Uniform(0.0, 10.0)
        return ud.sample(x.size())

    sample_like = torch.empty(10, 10, 1000)
    stat_funs = [torch.min, torch.max, torch.mean, torch.var]
    rng_harness(rng_op, sample_like, stat_funs)


# torch.uniform_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dt", [torch.float, torch.half])
def test_uniform_(dt):
    """Compare statistics of in-place ``uniform_`` for float and half."""

    def rng_op(x):
        return x.uniform_()

    input = torch.empty(size=(3, 4, 1000), dtype=dt)
    stat_funs = [torch.min, torch.max, torch.mean, torch.var]
    rng_harness(rng_op, input, stat_funs, expected_dtype=dt)


# torch.normal
@pytest.mark.ipuHardwareRequired
def test_normal():
    """Compare mean/var of ``torch.normal`` with scalar mean and std."""

    def rng_op(x):
        return torch.normal(mean=0.0, std=1.0, size=x.size())

    input = torch.empty(6, 10, 1000)
    stat_funs = [torch.mean, torch.var]
    rng_harness(rng_op, input, stat_funs)


# torch.normal_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dt", [torch.float, torch.half])
def test_normal_(dt):
    """Compare mean/var of in-place ``normal_`` for float and half."""

    def rng_op(x):
        return x.normal_(mean=1.0, std=2.0)

    input = torch.empty(size=(3, 5, 1000), dtype=dt)
    stat_funs = [torch.mean, torch.var]
    rng_harness(rng_op, input, stat_funs, expected_dtype=dt)


# torch.normal with buffers and params
@pytest.mark.ipuHardwareRequired
def test_normal_buffers():
    """Use a buffer as mean and a parameter as std of ``torch.normal``.

    Only the dtype and the size of the IPU output are checked; the values
    are not compared against the native output.
    """

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("mean", torch.Tensor([1.0, 2.0, 3.0]))
            self.register_parameter(
                "std", torch.nn.Parameter(torch.Tensor([0.5, 1.0, 1.5])))

        def forward(self, x):
            torch.manual_seed(42)
            return torch.normal(self.mean, 0.5) + torch.normal(1.0,
                                                               self.std) + x

    model = Model()

    # Run on IPU and check that the result has the correct type
    opts = poptorch.Options().randomSeed(8)
    pop_model = poptorch.inferenceModel(model, opts)
    pop_out = pop_model(torch.tensor([0.0, 0.0, 0.0]))
    assert pop_out.dtype == torch.float

    native_out = model(torch.tensor([0.0, 0.0, 0.0]))
    assert native_out.size() == pop_out.size()


# torch.distributions.Normal
# The sample method uses torch.normal(Tensor mean, Tensor std)
@pytest.mark.ipuHardwareRequired
def test_distributions_normal():
    """Compare per-column mean/std of samples from a tensor-valued Normal."""

    def rng_op(x):
        h = torch.tensor([234.0, 100.0])
        nd = torch.distributions.Normal(loc=h, scale=torch.sqrt(h))
        return nd.sample(x.size())

    input = torch.empty(10000, 5)
    # The lambdas are given explicit names because rng_harness prints
    # ``__name__`` for each statistic.
    mean = lambda x: torch.mean(x, dim=[0, 1])
    mean.__name__ = "torch.mean(x, dim=[0, 1])"

    std = lambda x: torch.std(x, dim=[0, 1])
    std.__name__ = "torch.std(x, dim=[0, 1])"

    stat_funs = [mean, std]
    rng_harness(rng_op, input, stat_funs)


# torch.randn
@pytest.mark.ipuHardwareRequired
def test_randn():
    """Compare mean/var of ``torch.randn`` samples."""

    def rng_op(x):
        return torch.randn(x.size())

    input = torch.empty(3, 5, 10000)
    stat_funs = [torch.mean, torch.var]
    rng_harness(rng_op, input, stat_funs)


# torch.random_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("input", [
    torch.empty(3, 5, 10000, dtype=torch.float),
    torch.empty(3, 5, 10000, dtype=torch.int),
])
def test_random(input):
    """Compare statistics of in-place ``random_(5, 100)`` for float and int."""

    def rng_op(x):
        return x.random_(5, 100)

    stat_funs = [torch.min, torch.max, int_mean, int_var]
    rng_harness(rng_op, input, stat_funs, input.dtype)


# torch.randint
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("dtype", [None, torch.int32, torch.half, torch.float])
def test_randint(dtype):
    """Compare statistics of ``torch.randint(5, 100)`` for several dtypes.

    When no dtype is requested the IPU output is expected to be int32.
    """

    def rng_op(x):
        return torch.randint(5, 100, x.size(), dtype=dtype)

    input = torch.empty(3, 5, 10000)
    stat_funs = [torch.min, torch.max, int_mean, int_var]
    rng_harness(rng_op, input, stat_funs,
                torch.int32 if dtype is None else dtype)


# torch.normal(Tensor mean, float std)
@pytest.mark.ipuHardwareRequired
def test_normal_tensor_mean():
    """Compare mean/std of ``torch.normal`` with a tensor mean."""

    def rng_op(x):
        return torch.normal(mean=x, std=3.0)

    mean = torch.full(size=(10000, 2), fill_value=4.0)
    stat_funs = [torch.mean, torch.std]
    rng_harness(rng_op, mean, stat_funs)


# torch.normal(float mean, Tensor std)
@pytest.mark.ipuHardwareRequired
def test_normal_tensor_std():
    """Compare mean/std of ``torch.normal`` with a tensor std."""

    def rng_op(x):
        return torch.normal(mean=3.0, std=x)

    std = torch.full(size=(10000, 2), fill_value=9.0)
    stat_funs = [torch.mean, torch.std]
    rng_harness(rng_op, std, stat_funs)


# torch.bernoulli - test with both float and half types
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("t", [torch.float, torch.half])
def test_bernoulli(t):
    """Compare min/max/mean of ``torch.bernoulli`` with probability 0.5."""
    prob = torch.full(size=(3, 5, 100), dtype=t, fill_value=0.5)
    stat_funs = [torch.min, torch.max, torch.mean]
    rng_harness(torch.bernoulli, prob, stat_funs, expected_dtype=t)


# torch.bernoulli - check expected output for probability limits.
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("p", [0.0, 1.0])
def test_bernoulli_limits(p):
    """With probability 0.0 or 1.0 every sample must equal the probability."""
    prob = torch.full(size=(3, 5, 1000), fill_value=p)
    func = lambda x: torch.all(x == p)
    func.__name__ = f"torch.all(x == {p})"
    rng_harness(torch.bernoulli, prob, [func])


# torch.bernoulli_
@pytest.mark.ipuHardwareRequired
def test_bernoulli_():
    """Compare min/max/mean of in-place ``bernoulli_(p=0.3)``."""

    def rng_op(x):
        return x.bernoulli_(p=0.3)

    input = torch.empty(3, 5, 100)
    stat_funs = [torch.min, torch.max, torch.mean]
    rng_harness(rng_op, input, stat_funs)


# torch.distributions.Bernoulli
@pytest.mark.ipuHardwareRequired
def test_distributions_bernoulli():
    """Compare statistics of samples drawn from ``Bernoulli(0.5)``."""

    def rng_op(x):
        bd = torch.distributions.Bernoulli(0.5)
        return bd.sample(x.size())

    input = torch.empty(10, 10, 1000)
    stat_funs = [torch.min, torch.max, torch.mean]
    rng_harness(rng_op, input, stat_funs)


# torch.exponential_
@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("lambd", [0.5, 1.0])
def test_exponential_(lambd):
    """Compare the mean of in-place ``exponential_`` samples."""

    def rng_op(x):
        return x.exponential_(lambd=lambd)

    input = torch.empty(3, 5, 100)
    stat_funs = [torch.mean]
    rng_harness(rng_op, input, stat_funs)


# torch.distributions.Exponential
@pytest.mark.ipuHardwareRequired
def test_distributions_exponential():
    """Compare the mean of samples drawn from ``Exponential(0.5)``."""

    def rng_op(x):
        bd = torch.distributions.Exponential(0.5)
        return bd.sample(x.size())

    input = torch.empty(10, 10, 1000)
    stat_funs = [torch.mean]
    rng_harness(rng_op, input, stat_funs)


@pytest.mark.ipuHardwareRequired
def test_randperm():
    """Run ``torch.randperm``; only dtype, size and ``numel`` are compared."""

    def rng_op(x):
        return torch.randperm(x.size(dim=0)) + 0

    input = torch.arange(100)
    stat_funs = [torch.numel]
    rng_harness(rng_op, input, stat_funs, torch.int32)


@pytest.mark.ipuHardwareRequired
def test_random_seed_repeatability():
    """Two models built from the same seeded options must agree exactly."""

    class Model(torch.nn.Module):
        def forward(self, x):
            x = x + 0  # Ensure input is not modified in place
            return x.normal_()

    # Run the model once with a random seed
    model = Model()
    opts = poptorch.Options().randomSeed(42)
    first_model = poptorch.inferenceModel(model, opts)
    first_run = first_model(torch.empty((2, 2)))

    # Second run with the same seed should produce identical results
    second_model = poptorch.inferenceModel(model, opts)
    second_run = second_model(torch.empty((2, 2)))

    helpers.assert_allequal(expected=first_run, actual=second_run)

================================================
FILE: tests/reduce_ops_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

import torch
import pytest
import numpy as np
import helpers
import poptorch


# Reduce Ops Harness
# Checks that the IPU reduce ops match the CPU version.
def reduce_harness(func, input, **kwargs):
    """Run ``func(input, **kwargs)`` on the IPU and natively and compare.

    Keyword arguments whose value is ``None`` are not forwarded to
    ``func``. Floating point results are compared with ``assert_allclose``
    and all others with ``assert_allequal``; the dtype is only checked when
    a ``dtype`` keyword was passed to the harness.
    """
    # pass any reduce op kwargs only if they're set to
    # avoid named tensor errors
    op_kwargs = {name: val for name, val in kwargs.items() if val is not None}

    def reduce_op(x):
        return func(x, **op_kwargs)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.reduce_op = reduce_op

        def forward(self, x):
            # Ensure input is not modified in place
            x = x + 0
            return self.reduce_op(x)

    model = Model()

    # Run on IPU, then natively; the comparison happens below.
    pop_model = poptorch.inferenceModel(model)
    pop_out = pop_model(input)

    native_out = model(input)

    # True whenever the caller passed a ``dtype`` keyword, even if None.
    check_dtype = "dtype" in kwargs
    if torch.is_floating_point(native_out):
        helpers.assert_allclose(expected=native_out,
                                actual=pop_out,
                                check_dtype=check_dtype)
    else:
        helpers.assert_allequal(expected=native_out,
                                actual=pop_out,
                                check_dtype=check_dtype)


# torch.all, torch.any
@pytest.mark.parametrize("dim", [None, 0, -1])
@pytest.mark.parametrize("func", [torch.all, torch.any])
def test_any_all(func, dim):
    """Compare ``torch.all`` / ``torch.any`` on random integers in [0, 3)."""
    input = torch.randint(low=0, high=3, size=(32, 128))
    reduce_harness(func, input, dim=dim)
@pytest.mark.parametrize("dim", [None, 0, -1]) @pytest.mark.parametrize("func", [torch.sum, torch.mean]) def test_sum_mean(func, dim): input = torch.rand(32, 128) reduce_harness(func, input, dim=dim) @pytest.mark.parametrize("dim", (None, 0, -1, [1, 2])) def test_count_nonzero(dim): torch.manual_seed(42) input = torch.randint(10, (2, 3, 4, 5)) reduce_harness(torch.count_nonzero, input, dim=dim) @pytest.mark.parametrize("dim", (None, 0, -1, [0, 1])) @pytest.mark.parametrize("keepdim", [True, False]) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_nansum(dim, keepdim, dtype): np.random.seed(0) # Create a tensor that contains some nans - be careful in the # torch.float16 case to not overflow the float16 range shape = (10, 10) mask = np.random.randint(0, 2, size=shape).astype(bool) data = np.random.rand(*shape).astype(np.float32) data[mask] = np.nan input = torch.from_numpy(data) reduce_harness( torch.nansum, input, dim=dim, keepdim=keepdim, dtype=dtype) ================================================ FILE: tests/replicated_graph_test.py ================================================ # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import torch
import pytest
import numpy as np
import helpers
import poptorch


@pytest.mark.ipuHardwareRequired
def test_weight_update_replicas(process_id=0, num_processes=1):
    """Compare one replicated training step against a CPU reference.

    The reference accumulates the gradients of ``replicationFactor``
    passes over the same input before a single SGD step. The IPU model is
    run once with the input repeated ``localReplicationFactor`` times, and
    its output and updated weights are compared to the reference.
    """
    localReplicationFactor = 2

    opts = poptorch.Options()
    opts.replicationFactor(localReplicationFactor)
    opts.Distributed.configureProcessId(process_id, num_processes)

    replicationFactor = localReplicationFactor * opts.Distributed.numProcesses

    np.random.seed(42)

    A = np.random.rand(2, 4).astype(np.float32)
    B = np.ones((4, 6)).astype(np.float32)
    C = np.random.rand(2, 6).astype(np.float32)

    alpha = np.random.random(1).astype(np.float32)[0]
    beta = np.random.random(1).astype(np.float32)[0]

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.b = torch.tensor(B, requires_grad=True)
            self.c = torch.tensor(C, requires_grad=True)

            # Create the weight tensors for pytorch
            self.B = torch.nn.Parameter(self.b, requires_grad=True)
            self.C = torch.nn.Parameter(self.c, requires_grad=True)

            self.matmul = torch.matmul
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, input, target):
            # Perform the GEMM operation
            x = alpha * self.matmul(input, self.B) + beta * self.C
            loss = self.loss(x, target)
            return x, loss

    def reference():
        """Return ``[outputs, B, C]`` computed with plain PyTorch."""
        module = Model()
        module.train()

        optimizer = torch.optim.SGD(module.parameters(),
                                    lr=0.01,
                                    weight_decay=0.0,
                                    momentum=0.0)

        a = torch.tensor(A, requires_grad=True)

        optimizer.zero_grad()

        outputs = ()

        # graph with gradient accumulation i.e. only update the weights
        # after x passes
        for _ in range(replicationFactor):
            target = torch.zeros(C.shape)
            out, loss = module(a, target)
            outputs = outputs + (out, )
            loss.backward()

        # Update the weights
        optimizer.step()

        # Only keep the output slice corresponding to this process
        outputs = outputs[opts.Distributed.processId *
                          localReplicationFactor:][:localReplicationFactor]
        return [torch.cat(outputs), module.B.data, module.C.data]

    model = Model()

    poptorch_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=torch.optim.SGD(
                                                model.parameters(),
                                                lr=0.01,
                                                weight_decay=0.0,
                                                momentum=0.0))

    ref_out = reference()

    # Repeat the input once per local replica.
    ipu_A = np.concatenate([A for _ in range(localReplicationFactor)])

    target = torch.zeros(2 * localReplicationFactor, 6)
    output, _ = poptorch_model(torch.tensor(ipu_A, requires_grad=True), target)

    out = [output, model.B.data, model.C.data]
    for idx, ref in enumerate(ref_out):
        print("Validating output %d" % idx)
        helpers.assert_allclose(actual=out[idx],
                                expected=ref,
                                rtol=1e-03,
                                atol=1e-03)


@pytest.mark.ipuHardwareRequired
def test_too_many_ipus():
    """Requesting a replication factor of 128 must raise a poptorch.Error."""
    localReplicationFactor = 128

    opts = poptorch.Options()
    opts.replicationFactor(localReplicationFactor)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(128, 4)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, input, target):
            out = self.layer(input)
            loss = self.loss(out, target)
            return out, loss

    model = Model()

    poptorch_model = poptorch.trainingModel(model,
                                            options=opts,
                                            optimizer=torch.optim.SGD(
                                                model.parameters(),
                                                lr=0.01,
                                                weight_decay=0.0,
                                                momentum=0.0))

    np.random.seed(42)

    input = np.random.rand(512, 128).astype(np.float32)
    labels = np.ones((128, 4)).astype(np.float32)

    with pytest.raises(
            poptorch.Error,
            match=r"Too many IPUs requested \(128\)\. Experiments that need .*"
    ):
        poptorch_model(torch.tensor(input, requires_grad=True),
                       torch.tensor(labels))


class ModelWithLoss(torch.nn.Module):
    """Multiply the input by a single weight ``W``.

    ``forward`` returns ``(Z, loss)`` where ``Z = X @ W`` and the loss is
    the mean of ``Z**2``.
    """

    def __init__(self, W_init):
        super().__init__()
        self.W = torch.nn.Parameter(W_init)

    def forward(self, X):
        Z = X @ self.W
        return Z, poptorch.identity_loss(Z**2, reduction="mean")


@pytest.mark.ipuHardwareRequired
@pytest.mark.parametrize("orthogonalInput", [True, False])
def test_per_replica_variables(orthogonalInput):
    """Train with a weight split across replica groups and compare to CPU.

    The same problem is trained for ``prob_steps`` SGD steps on the CPU
    and on the IPU, where the weight is split into ``tensor_shards``
    pieces and the input into ``data_shards`` pieces. ``orthogonalInput``
    swaps the communication group types used for the input and the weight.
    """
    # Split the weight tensor into 4, and the input data tensor into 2.
    tensor_shards = 4
    data_shards = 2

    # Set up the problem
    random = np.random.RandomState(seed=100)
    prob_X = random.normal(size=(24, 40)).astype(np.float32)
    prob_W_init = random.normal(size=(40, 56)).astype(
        np.float32) * (5 * 8)**-0.5
    prob_steps = 4

    # Run on the CPU
    X = torch.tensor(prob_X)
    W = torch.nn.Parameter(torch.tensor(prob_W_init))
    optim = torch.optim.SGD([W], lr=0.01)
    cpu_losses = []
    for _ in range(prob_steps):
        optim.zero_grad()
        v = (X @ W)**2
        loss = torch.mean(v)
        loss.backward()
        optim.step()
        cpu_losses.append(loss.detach())
    cpu_losses = np.array(cpu_losses)
    cpu_W_final = W.detach().numpy()

    # Run on 8 IPUs
    # Split the columns of W into ``tensor_shards`` groups and move the
    # shard axis to the front.
    W_init = torch.tensor(
        prob_W_init.reshape(prob_W_init.shape[0], tensor_shards,
                            prob_W_init.shape[1] // tensor_shards).transpose(
                                1, 0, 2)).contiguous()
    m = ModelWithLoss(W_init)
    optim = torch.optim.SGD(m.parameters(), lr=0.01)

    inputGroupType = poptorch.enums.CommGroupType.Consecutive
    weightGroupType = poptorch.enums.CommGroupType.Orthogonal
    if orthogonalInput:
        inputGroupType, weightGroupType = weightGroupType, inputGroupType

    pt_opts = poptorch.Options()
    pt_opts.replicationFactor(data_shards * tensor_shards)
    pt_opts.inputReplicaGrouping(tensor_shards, inputGroupType)
    pt_opts.outputMode(poptorch.OutputMode.All)
    pt_m = poptorch.trainingModel(m, optimizer=optim, options=pt_opts)
    pt_m.W.replicaGrouping(weightGroupType, data_shards,
                           poptorch.enums.VariableRetrievalMode.OnePerGroup)
    pt_losses = []
    if data_shards > 1:
        X = X.reshape(data_shards, X.shape[0] // data_shards, *X.shape[1:])
    for _ in range(prob_steps):
        _, loss = pt_m(X)
        # We divide by the number of replicas because the mean is being
        # taken only over a part of the tensor on each replica, so we need to
        # divide by the number of replicas to get the correct mean.
        pt_losses.append(
            torch.sum(loss.detach()) / (data_shards * tensor_shards))
    pt_losses = np.array(pt_losses)
    # Undo the shard-first layout to get back the original weight shape.
    pt_W_final = m.W.detach().numpy().transpose(1, 0, 2) \
        .reshape(prob_W_init.shape)

    np.testing.assert_allclose(cpu_losses, pt_losses, atol=1e-6)
    np.testing.assert_allclose(cpu_W_final, pt_W_final, atol=1e-6)


@pytest.mark.ipuHardwareRequired
def test_per_replica_variables_no_grouping():
    """Check the optimizer state shapes when ``W`` uses ``NoGrouping``.

    After one training step every tensor in the optimizer's "ipu_state"
    must have ``shape0`` as its first dimension.
    """
    shape0 = 4
    shape1 = shape0 + 2
    model = ModelWithLoss(torch.randn(shape0, shape1))
    optimizer = poptorch.optim.AdamW(model.parameters(), 0.1)
    options = poptorch.Options()
    options.replication_factor = shape0
    training_model = poptorch.trainingModel(model,
                                            options,
                                            optimizer=optimizer)
    training_model.W.replicaGrouping(
        poptorch.CommGroupType.NoGrouping, 0,
        poptorch.VariableRetrievalMode.OnePerGroup)
    training_model(torch.randn(shape0, shape1))
    for _, v in optimizer.state_dict()["ipu_state"].items():
        assert v.shape[0] == shape0
    training_model.destroy()

================================================
FILE: tests/requires_grad_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2021 Graphcore Ltd. All rights reserved.

import torch
import helpers
import poptorch


def test_requires_grad_false_simple():
    """Parameters created with ``requires_grad=False`` must not be trained.

    After 100 further training steps ``a`` and ``b`` must have changed
    while ``c`` and ``d`` must still hold their initial values.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, a, b, c, d):
            super().__init__()
            self.a = torch.nn.Parameter(a)
            self.b = torch.nn.Parameter(b)
            self.c = torch.nn.Parameter(c, requires_grad=False)
            self.d = torch.nn.Parameter(d, requires_grad=False)
            self.loss = torch.nn.MSELoss()

        def forward(self, target):
            s0 = self.a + self.b
            s1 = self.c + self.d
            return self.loss(s0 + s1, target)

    # Ends up with requires_grad=True.
    a = torch.randn(5)
    b = torch.randn(5)

    # Ends up with requires_grad=False.
    c = torch.randn(5)
    d = torch.randn(5)

    target = torch.randn(5)

    # The model gets clones so the originals can be compared afterwards.
    model = Model(a.clone(), b.clone(), c.clone(), d.clone())
    native_out = model(target)

    poptorch_model = poptorch.trainingModel(model)
    poptorch_out = poptorch_model(target)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    for _ in range(100):
        poptorch_out = poptorch_model(target)

    assert not torch.allclose(poptorch_out, native_out)

    # 'a' and 'b' are updated
    assert not torch.allclose(poptorch_model.a.data, a)
    assert not torch.allclose(poptorch_model.b.data, b)

    # 'c' and 'd' are not updated
    helpers.assert_allclose(actual=poptorch_model.c.data, expected=c)
    helpers.assert_allclose(actual=poptorch_model.d.data, expected=d)


def test_requires_grad_false_on_single_input():
    """Add a trainable and a non-trainable parameter in a single op.

    After 100 further training steps ``a`` must have changed while ``b``
    must still hold its initial value.
    """
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self, a, b):
            super().__init__()
            self.a = torch.nn.Parameter(a)
            self.b = torch.nn.Parameter(b, requires_grad=False)
            self.loss = torch.nn.MSELoss()

        def forward(self, target):
            s = self.a + self.b
            return self.loss(s, target)

    # Ends up with requires_grad=True.
    a = torch.randn(5)

    # Ends up with requires_grad=False.
    b = torch.randn(5)

    target = torch.randn(5)

    model = Model(a.clone(), b.clone())
    native_out = model(target)

    poptorch_model = poptorch.trainingModel(model)
    poptorch_out = poptorch_model(target)

    helpers.assert_allclose(actual=poptorch_out, expected=native_out)

    for _ in range(100):
        poptorch_out = poptorch_model(target)

    assert not torch.allclose(poptorch_out, native_out)

    # 'a' is updated
    assert not torch.allclose(poptorch_model.a.data, a)

    # 'b' is not updated
    helpers.assert_allclose(actual=poptorch_model.b.data, expected=b)

================================================
FILE: tests/rnn_test.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
@pytest.mark.parametrize("nonlinearity", ['tanh', 'relu'])
@pytest.mark.parametrize("batch_first", [True, False])
def test_rnn(nonlinearity, batch_first):
    """Compare a single-layer ``nn.RNN`` on CPU and IPU over several batches.

    For every batch the output and hidden state of both models are compared,
    the IPU weights are checked to have changed, and the CPU hidden state is
    fed into the next iteration.
    """
    torch.manual_seed(42)

    num_batches = 10
    sequence_length = 5
    batch_size = 8
    input_size = 4
    hidden_size = 3
    num_layers = 1

    # The position of the batch dimension depends on batch_first.
    input_shape = ((batch_size, sequence_length, input_size)
                   if batch_first else
                   (sequence_length, batch_size, input_size))

    inputs = [torch.randn(input_shape) for _ in range(num_batches)]
    hidden = torch.randn((num_layers, batch_size, hidden_size))

    rnn = nn.RNN(input_size,
                 hidden_size,
                 num_layers,
                 nonlinearity=nonlinearity,
                 batch_first=batch_first)

    def first_output(outputs):
        return outputs[0]

    model = helpers.ModelWithWeights(rnn, inputs[0].shape, first_output)
    ipu_model = poptorch.trainingModel(model)

    for batch in inputs:
        (out_cpu, h_cpu), _ = model((batch, hidden))
        (out_ipu, h_ipu), _ = ipu_model((batch, hidden))

        helpers.assert_allclose(actual=out_ipu, expected=out_cpu)
        helpers.assert_allclose(actual=h_ipu, expected=h_cpu)
        ipu_model.assert_weights_changed()

        # Carry the CPU hidden state over to the next batch.
        hidden = h_cpu
def test_sharded_execution():
    """Run a four-block model with ``ShardedExecution`` and compare to CPU.

    Each block is placed on its own IPU id (0 to 3) and multiplies the
    running value by 2, 3, 4 and 5 respectively.
    """
    factors = (2, 3, 4, 5)

    class Model(torch.nn.Module):
        def forward(self, x):
            # One poptorch.Block per factor; the block name and ipu_id are
            # both the position of the factor.
            for index, factor in enumerate(factors):
                with poptorch.Block(str(index), ipu_id=index):
                    x = x * factor
            return x

    native = Model()

    stages = []
    for index in range(0, 4):
        stages.append(poptorch.Stage(str(index)))

    opts = poptorch.Options()
    opts.setExecutionStrategy(poptorch.ShardedExecution(*stages))
    ipu = poptorch.inferenceModel(native, opts)

    torch.manual_seed(42)
    inp = torch.randn(3, 7)

    expected = native(inp)
    actual = ipu(inp)
    helpers.assert_allclose(actual=actual, expected=expected)
@pytest.mark.parametrize("step", [1, 2, 3])
def test_slice_idx_size_of(step):
    """Slice with bounds derived from the size of the sliced tensor.

    The slice starts at half the length of the tensor and ends one element
    before its end, using the parametrized step.
    """
    def start_fn(tensor_in):
        length = tensor_in.shape[0]
        return length // 2

    def end_fn(tensor_in):
        length = tensor_in.shape[0]
        return length - 1

    tensor_x = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    tensor_y = torch.tensor([3.0])

    slice_test_harness(tensor_x, tensor_y, start_fn, end_fn, step)
@pytest.mark.parametrize("step", [1, 2, 3])
def test_dynamic_slice_one_dim_mix_up(step):
    """Dynamic slice whose bounds come from a chain of adds and subtracts.

    The start is ``extra_in + 3 - 10 + 3`` and the end is
    ``extra_in - 6 + 4``, each evaluated left to right.
    """
    def start_fn(extra_in):
        return ((extra_in + 3) - 10) + 3

    def end_fn(extra_in):
        return (extra_in - 6) + 4

    tensor_in = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    extra_in = torch.tensor([5])

    dynamic_slice_harness(tensor_in, extra_in, start_fn, end_fn, step)
def test_dynamic_slice_one_dim_equal():
    """A dynamic slice with identical start and end must raise an error."""
    def start_fn(extra_in):
        return extra_in

    def end_fn(extra_in):
        return extra_in

    tensor_in = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    extra_in = torch.tensor([5])
    step = 1

    error_msg = r"The start and end of a slice must be different."
    with pytest.raises(poptorch.Error, match=error_msg):
        # Set test_training=False because we expect inference to fail
        dynamic_slice_harness(tensor_in,
                              extra_in,
                              start_fn,
                              end_fn,
                              step,
                              test_training=False)
def test_select_negative_dim():
    """``Tensor.select`` with a negative dim must match the CPU result."""
    class Model(torch.nn.Module):
        def forward(self, x):
            selected = x.select(-1, 1)
            return selected

    model = Model()
    poptorch_model = poptorch.inferenceModel(model, poptorch.Options())

    input_tensor = torch.rand((2, 4))

    # The IPU model is evaluated before the native one, as in the
    # comparison call it feeds.
    ipu_out = poptorch_model(input_tensor)
    cpu_out = model(input_tensor)
    helpers.assert_allequal(actual=ipu_out, expected=cpu_out)
def dynamic_update_harness(tensor_in,
                           src_in,
                           extra_in,
                           start_fn,
                           end_fn,
                           dim=0,
                           test_training=False):
    """Run ``poptorch.dynamic_update`` on IPU and CPU and compare the outputs.

    Args:
        tensor_in: Tensor passed as the first argument of ``dynamic_update``.
        src_in: Tensor passed as the second argument of ``dynamic_update``.
        extra_in: Tensor given to ``start_fn`` to produce the start argument.
        start_fn: Callable mapping ``extra_in`` to the start argument.
        end_fn: Callable used together with ``start_fn`` (both evaluated on
            the value 1) to derive the constant ``size`` argument.
        dim: Dimension argument forwarded to ``dynamic_update``.
        test_training: When True, wrap the op in ``helpers.ModelWithWeights``
            and run a training model; otherwise run an inference model.
    """
    # TODO(T62094) PopART doesn't currently support dynamic slices in training.
    # Once it works, switch back test_training to True by default.
    options = poptorch.Options()
    if test_training:
        # The size is a Python constant: end - start evaluated at 1.
        size = end_fn(1) - start_fn(1)
        op = lambda t, s, e: poptorch.dynamic_update(t, s, dim, start_fn(e),
                                                     size)

        model = helpers.ModelWithWeights(op, tensor_in.shape)

        # Run on IPU.
        poptorch_model = poptorch.trainingModel(model, options)
        poptorch_out, _ = poptorch_model((tensor_in, src_in, extra_in))

        # Run on CPU.
        native_out, _ = model((tensor_in, src_in, extra_in))

        # Training test - check weights changed
        poptorch_model.assert_weights_changed()
    else:
        model = torch.nn.Module()
        # Same size computation as above, but on int tensors, converted to a
        # Python number with .item().
        size = (end_fn(torch.tensor([1], dtype=torch.int)) -
                start_fn(torch.tensor([1], dtype=torch.int))).item()
        model.forward = lambda t, s, e: poptorch.dynamic_update(
            t, s, dim, start_fn(e), size)

        # Run on IPU.
        poptorch_model = poptorch.inferenceModel(model, options)
        # Make sure the model is compiled using different tensor values
        # otherwise there is no way to tell if the values are compiled
        # in the executable or truly dynamic.
        poptorch_model.compile(
            torch.randn_like(tensor_in),  # Use a random input
            torch.randn_like(src_in),  # Use random source values
            extra_in + torch.tensor([20])  # Offset extra_in
        )
        poptorch_out = poptorch_model(tensor_in, src_in, extra_in)

        # Run on CPU.
        native_out = model(tensor_in, src_in, extra_in)

    # Both branches define native_out and poptorch_out.
    helpers.assert_allclose(expected=native_out, actual=poptorch_out)
def test_dynamic_update_wrong_dim():
    """A 1-D source used to update a 2-D input must raise an error."""
    def start_fn(extra_in):
        return extra_in.to(torch.int32)

    def end_fn(extra_in):
        start = extra_in.to(torch.int32)
        return start + 1

    tensor_in = torch.tensor([[2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                              [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]])
    # One dimension fewer than tensor_in.
    src_in = torch.tensor([-1.0] * 8)
    extra_in = torch.tensor([0])

    error_msg = (r"input and src tensors must have same dimensionality. "
                 r"\(2\) vs \(1\)")
    with pytest.raises(poptorch.Error, match=error_msg):
        dynamic_update_harness(tensor_in, src_in, extra_in, start_fn, end_fn)
def test_dynamic_update_wrong_dtype():
    """Compiling ``dynamic_update`` with mismatched dtypes must raise.

    The input is a float tensor while the source is an integer tensor; the
    error is expected at compile time.
    """
    t = torch.tensor([2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    s = torch.tensor([-1])
    idx = torch.tensor([1])

    model = torch.nn.Module()
    model.forward = lambda t, s, e: poptorch.dynamic_update(t, s, 0, idx, 1)

    # Run on IPU.
    options = poptorch.Options()
    poptorch_model = poptorch.inferenceModel(model, options)

    # Every literal dot is escaped so that the pattern only matches the
    # exact dtype names (an unescaped "." would match any character).
    error_msg = (r"input and src tensor must have same dtype\."
                 r" \(torch\.float32 vs torch\.int32\)")
    with pytest.raises(poptorch.Error, match=error_msg):
        poptorch_model.compile(
            torch.randn_like(t),  # Use a random input
            s,
            idx + torch.tensor([20])  # Offset extra_in
        )
import copy from functools import partial import re import pytest import torch import helpers import poptorch # Tensors # Creation ops (we don't support many of these) # torch.numel, torch.tensor, torch.sparse_coo_tensor, torch.as_tensor, torch.as_strided, torch.from_numpy, torch.zeros, # torch.zeros_like, torch.ones, torch.ones_like, torch.arange, torch.range, torch.linspace, torch.logspace, torch.eye, # torch.empty, torch.empty_like, torch.empty_strided, torch.full, torch.full_like, torch.quantize_per_tensor, torch.quantize_per_channel, # Indexing, Slicing, Joining, Mutating Ops # torch.cat, torch.chunk, torch.gather, torch.index_select, torch.masked_select, torch.narrow, torch.nonzero, torch.reshape, torch.split, # torch.squeeze, torch.stack, torch.t, torch.take, torch.transpose, torch.unbind, torch.unsqueeze, torch.where, torch._C.Generator, # torch._C.Generator.device, def zeros_and_ones_harness(model, dtype, is_like): assert dtype in [torch.float16, torch.float32, torch.int32, torch.bool] torch.manual_seed(42) # Calculating with ints/bools does not produce meaningful gradients test_training = not dtype in (torch.int32, torch.bool) inputs = [torch.tensor([1], dtype=dtype)] if is_like: inputs.append(torch.empty(3, 5, 1)) inputs = tuple(inputs) options = poptorch.Options() if test_training: out_fn = lambda out: out[0] model = helpers.ModelWithWeights(model, inputs[0].shape, out_fn=out_fn) # We need to copy the model to use the original weights for native comparison model_copy = copy.deepcopy(model) # Run on IPU. 
def fuzzy_compare_exceptions(e_cpu, e_ipu):
    """Loosely check that the CPU error message is contained in the IPU one.

    Only the first line of the CPU message is considered. Each of its
    whitespace-separated words must occur in the IPU message, and the last
    occurrence of that word in the IPU message must sit at a word index that
    is not smaller than the word's index in the CPU line.

    Raises:
        ValueError: if a CPU word fails the check; the IPU exception is
            attached as the cause.
    """
    # Index of the last occurrence of every word of the IPU message.
    ipu_positions = {}
    for position, token in enumerate(str(e_ipu).split()):
        ipu_positions[token] = position

    # Only compare the first line (The following lines are usually a
    # stacktrace)
    cpu_msg = str(e_cpu).split("\n")[0]

    for position, token in enumerate(cpu_msg.split()):
        if ipu_positions.get(token, -1) < position:
            raise ValueError("CPU and IPU error messages did not match: "
                             f"'{cpu_msg}' not in '{e_ipu}'") from e_ipu

    print(f"CPU and IPU error messages did match: '{cpu_msg}' in '{e_ipu}'")
if native_out is None: native_out, op_raises_exception = exception_catcher(model, inputs) # native_out could be an alias of the input and so modified by # the poptorch_model, except if its an error if op_raises_exception: if not allow_native_errors: raise native_out[1] elif isinstance(native_out, tuple): # pylint: disable=E1101 native_out = tuple(n.clone().detach() for n in native_out) else: native_out = native_out.clone().detach() else: op_raises_exception = isinstance( native_out, tuple) and native_out[0] == "error" # Run on IPU. poptorch_model = poptorch.trainingModel(model, options=options) poptorch_out, ipu_raises = exception_catcher( poptorch_model, inputs, can_raise_exception=op_raises_exception) # Training test - check weights changed if no error was thrown try: poptorch_model.assert_weights_changed() assert not op_raises_exception, ( "Weights changed despite errors being " "thrown in IPU evaluation.") except AssertionError: if not op_raises_exception: raise else: model = torch.nn.Module() model.forward = op # Run on CPU. if native_out is None: native_out, op_raises_exception = exception_catcher(model, *inputs) if op_raises_exception and not allow_native_errors: raise native_out[1] else: op_raises_exception = isinstance( native_out, tuple) and native_out[0] == "error" poptorch_model = poptorch.inferenceModel(model, options) # Run on IPU. 
class TestOpHarness:
    """Test the exception matching functionality of the op_harness function."""
    exact_error_check = "Regex pattern.*does not match"
    fuzzy_error_check = "CPU and IPU error messages did not match"
    op_harness = op_harness

    @pytest.fixture(autouse=True, params=[True, False])
    def training(self, request, monkeypatch):
        # Every test runs twice: with test_training=True and False.
        harness = partial(op_harness, test_training=request.param)
        monkeypatch.setattr(self, "op_harness", harness)

    @staticmethod
    def _raising(message):
        """Return an op that always raises ``ValueError(message)``."""

        def op(x):
            raise ValueError(message)

        return op

    def test_fuzzy_error_mismatch(self):
        x = torch.randn(2, 3)
        cpu_result = ("error", ValueError("Hey"))

        with pytest.raises(ValueError, match=self.fuzzy_error_check):
            self.op_harness(self._raising("Hi"),
                            x,
                            native_out=cpu_result,
                            fuzzy_errors=True)

    def test_error_mismatch(self):
        x = torch.randn(2, 3)
        cpu_result = ("error", ValueError("Hey"))

        with pytest.raises(AssertionError, match=self.exact_error_check):
            self.op_harness(self._raising("Hi"), x, native_out=cpu_result)

    def test_exact_match(self):
        x = torch.randn(2, 3)
        self.op_harness(self._raising("Hi"), x)

    def test_fuzzy_match(self):
        x = torch.randn(2, 3)
        cpu_result = ("error", ValueError("Hey"))

        self.op_harness(self._raising("Hi Hey"),
                        x,
                        native_out=cpu_result,
                        fuzzy_errors=True)

    def test_fuzzy_mismatch(self):
        x = torch.randn(2, 3)
        cpu_result = ("error", ValueError("Hey Hi"))

        with pytest.raises(ValueError, match=self.fuzzy_error_check):
            self.op_harness(self._raising("Hi"),
                            x,
                            native_out=cpu_result,
                            fuzzy_errors=True)

    def test_reject_fuzzy_match_without_fuzzy_option(self):
        x = torch.randn(2, 3)
        cpu_result = ("error", ValueError("Hey"))

        with pytest.raises(AssertionError, match=self.exact_error_check):
            self.op_harness(self._raising("Hi Hey"), x, native_out=cpu_result)

    def test_reject_exception_if_not_native(self):
        x = torch.randn(2, 3)
        error = ValueError("Hi Hey")

        def op(x):
            raise error

        # The native output is a plain value (not an "error" tuple), so the
        # exception raised by the op is expected to propagate.
        with pytest.raises(type(error), match=str(error)):
            self.op_harness(op, x, native_out=1)

    def test_no_ipu_exception_with_native_exception(self):
        x = torch.randn(2, 3)
        error = ValueError("Hi Hey")

        def op(x):
            return torch.roll(x, 1)

        with pytest.raises(RuntimeError, match=str(error)):
            self.op_harness(op, x, native_out=("error", error))
def fast_gather_last_dim(data, idx):
    """Gather along the last dimension, via a custom op when on IPU.

    When ``poptorch.isRunningOnIpu()`` is true the ``FastGatherLastDim``
    custom op is used; otherwise ``torch.gather`` along dimension -1.
    Asserts that IPU hardware is available in both cases.
    """
    assert poptorch.ipuHardwareIsAvailable(), \
        "Hardware IPU needed to compile this FastGatherLastDim custom op"

    if not poptorch.isRunningOnIpu():
        return torch.gather(data, -1, idx)

    # Example output: shaped like the indices, typed like the data.
    example_output = torch.zeros(idx.shape).type_as(data)
    example_output.requires_grad_()
    outputs = poptorch.custom_op([data, idx],
                                 "FastGatherLastDim",
                                 "poptorch.custom_ops",
                                 1,
                                 example_outputs=[example_output],
                                 attributes={})
    return outputs[0]
@pytest.mark.parametrize("dim", [0, 1, 2, 3])
@pytest.mark.parametrize("larger_index", [True, False])
def test_gather_4dim(dim, larger_index):
    """``torch.gather`` on a 4-D tensor with two index shapes.

    First with indices shaped like the input, then with a smaller index
    tensor whose size along ``dim`` is optionally one larger than the input.
    """
    torch.manual_seed(42)
    shape = (5, 8, 6, 7)

    def gather(x, y):
        return torch.gather(x, dim, y)

    source = torch.randn(shape)
    full_indices = torch.randint(0, 5, shape)
    op_harness(gather, source, full_indices)

    small_shape = (4, 5, 2, 6)
    if larger_index:
        # Only the gathered dimension exceeds the input size.
        small_shape = tuple(
            shape[axis] + 1 if axis == dim else extent
            for axis, extent in enumerate(small_shape))

    small_indices = torch.randint(0, 5, small_shape)
    op_harness(gather, source, small_indices)
@pytest.mark.parametrize("dim", range(-3, 3))
@pytest.mark.parametrize("reduce", [None, "add", "multiply"])
def test_scatter_(dim, reduce):
    """In-place ``Tensor.scatter_`` with a tensor source on a 3x3x3 input."""
    torch.manual_seed(42)
    dim_length = 3
    shape = (dim_length, dim_length, dim_length)

    # Keep the creation order: it fixes which random values each tensor gets.
    input_tensor = torch.randn(shape)
    indices = torch.randint(dim_length, shape)
    source = torch.randn(shape)

    def scatter_in_place(inp, idx, src):
        return inp.scatter_(dim, idx, src, reduce)

    op_harness(scatter_in_place, input_tensor, indices, source)
def test_constExpr_reshape():
    """Compare CPU and IPU results of a reshape applied to an expanded
    constant mask."""
    a, b, c = 2, 3, 4

    class Model(torch.nn.Module):
        def forward(self, input):
            # The mask is built from a constant so that this code is run
            # in the ConstExpr pass.
            ones = torch.ones(b, a, device=input.device)
            # On CPU the expand is implemented by setting the stride to 0.
            mask = ones.to(torch.bool).unsqueeze(1).expand([-1, c, -1])
            return mask.reshape([b * c, a]) * input[0]

    input = torch.randn(1).to(torch.bool)
    out_native = Model()(input)

    ipu_model = poptorch.inferenceModel(Model(), poptorch.Options())
    out_ipu = ipu_model(input)

    helpers.assert_allequal(actual=out_ipu, expected=out_native)
@pytest.mark.parametrize(
    "shape",
    [
        (2, 4, 4),  # standard
        (2, 2, 2, 4, 4),  # more dimensions
        (2, 4, -1),  # negative dimension
        (2, 2, -1, 2, 4),  # negative & extra dimensions
    ])
def test_expand(shape):
    """Run Tensor.expand on a (2, 1, 4) input through op_harness for each
    target shape."""
    torch.manual_seed(42)
    source = torch.randn(2, 1, 4)

    def expand_to_shape(t):
        return t.expand(shape)

    op_harness(expand_to_shape, source)
@pytest.mark.parametrize("input_shapes", [
    ((10, 10), (10, 10), (10, 10)),
    ((10, 1, 10), (10, 10), (10, 10, 1)),
    ((), (), ()),
    ((10, 1, 10), (), (10, 10, 1)),
    ((), (10, 10), ()),
])
def test_where_broadcast(input_shapes):
    """Compare torch.where on the IPU against the CPU for condition / x / y
    operands of different shapes."""
    torch.manual_seed(42)
    cond_shape, x_shape, y_shape = input_shapes

    class Model(torch.nn.Module):
        def forward(self, cond, x, y):
            return torch.where(cond, x, y)

    # Random boolean condition followed by the two value tensors.
    cond = torch.empty(cond_shape).bernoulli_().to(torch.bool)
    x = torch.randn(x_shape)
    y = torch.randn(y_shape)

    cpu_mod = Model()
    ipu_mod = poptorch.inferenceModel(cpu_mod)

    actual = ipu_mod(cond, x, y)
    expected = cpu_mod(cond, x, y)
    torch.testing.assert_close(actual=actual, expected=expected)
@pytest.mark.parametrize("input_shapes", [(1, ), (2, ), (2, 3), (1, 3, 4)])
@pytest.mark.parametrize("dtype", [torch.float, torch.int])
def test_clone_one(input_shapes, dtype):
    """Run Tensor.clone through op_harness for float and integer inputs.

    The input tensor is created with the parametrized ``dtype`` so that the
    integer case really clones an integer tensor; outputs are compared for
    both value and dtype.
    """
    torch.manual_seed(42)
    op = lambda x: x.clone()

    if dtype.is_floating_point:
        x = torch.randn(*input_shapes, dtype=dtype)
    else:
        # randn cannot produce integer tensors: draw integers directly.
        x = torch.randint(-100, 100, input_shapes, dtype=dtype)

    def assert_fn(native_out, poptorch_out):
        for pop, native in zip(poptorch_out, native_out):
            assert native.dtype == pop.dtype
            helpers.assert_allclose(expected=native, actual=pop)

    # Calculating with integers does not produce meaningful gradients
    test_training = dtype is torch.float
    op_harness(op, x, test_training=test_training, assert_fn=assert_fn)
def test_torch_inference_mode():
    """Compile a poptorch inference model while torch.inference_mode() is
    active."""

    class SimpleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = torch.nn.Linear(10, 10)
            self.loss = torch.nn.MSELoss()

        def forward(self, x):
            hidden = self.fc1(x)
            # Index the rows with a tensor created on the same device.
            rows = torch.arange(4, device=hidden.device)
            selected = hidden[rows, :]
            return self.loss(selected, selected)

    # The wrapped module is deliberately left in training mode.
    wrapped = poptorch.inferenceModel(SimpleModel().train(),
                                      options=poptorch.Options())
    sample = torch.rand(4, 10)

    with torch.inference_mode():
        wrapped.compile(x=sample)
" "This tensor will be detached because backward pass via " "inputs is not supported.") @pytest.mark.parametrize("args", [(5, ), (1, ), (5, 10), (5, 10, 2), (10, 1, -1), (10, 1, -2), (1, 5, 10), (2.5, ), (2, 10.), (2, 10, 3.4)]) def test_arange(args): torch.manual_seed(42) class Model(torch.nn.Module): def forward(self, a): return torch.arange(*args) + a options = poptorch.Options() cpu_model = Model() ipu_model = poptorch.inferenceModel(cpu_model, options) a = torch.randn(()) cpu_res = cpu_model(a) ipu_res = ipu_model(a) helpers.assert_allclose(actual=ipu_res, expected=cpu_res) @pytest.mark.parametrize("args", [(5, ), (5, 10), (5, 10, 2), (2.5, ), (2, 10.), (2, 10, 3.4)]) def test_arange_types(args): torch.manual_seed(42) should_be_float = any(isinstance(a, float) for a in args) class Model(torch.nn.Module): def forward(self, a): res = torch.arange(*args) assert res.is_floating_point() == should_be_float return torch.index_select(res, 0, a) # So the graph's not empty options = poptorch.Options() cpu_model = Model() ipu_model = poptorch.inferenceModel(cpu_model, options) a = torch.tensor([0]) cpu_res = cpu_model(a) ipu_res = ipu_model(a) exp_dtype = cpu_res.dtype if exp_dtype == torch.int64: exp_dtype = torch.int32 elif exp_dtype == torch.float64: exp_dtype = torch.float32 # NOTE: this may depend on torch.get_default_dtype() assert ipu_res.dtype == exp_dtype @pytest.mark.parametrize("input_shape,dim,size,step", [((7, ), 0, 2, 1), ((7, ), 0, 2, 2), ((10, ), 0, 2, 2), ((10, ), 0, 2, 1), ((5, 5), 0, 2, 2), ((5, 5), 1, 2, 2), ((3, 2, 1), 0, 2, 2), ((10, 10, 10), 1, 5, 2)]) def test_unfold(input_shape, dim, size, step): torch.manual_seed(0) op = lambda x: x.unfold(dim, size, step) x = torch.randn(input_shape) op_harness(op, x) @pytest.mark.parametrize("op", [ lambda input: torch.take_along_dim(input, torch.argmax(input)), lambda input: torch.take_along_dim( input, torch.argsort(input, dim=1), dim=1), lambda input: torch.take_along_dim( input, torch.tensor([[2, 0, 1]], 
def check_url_works(url, max_redirects=10):
    """Send a HEAD request to *url* and report whether the link is broken.

    Args:
        url: The URL to test.
        max_redirects: How many chained ``302`` responses may still be
            followed before giving up.

    Returns:
        ``None`` when the URL is considered fine (this includes requests
        that could not be made because of a connection error or a timeout),
        otherwise a ``(url, message, code)`` tuple describing the 4xx
        response. For a ``302`` the message and code are those of the
        redirect target, while ``url`` stays the one that was passed in.

    Raises:
        requests.exceptions.TooManyRedirects: if more than *max_redirects*
            consecutive ``302`` responses are encountered.
    """
    from urllib.parse import urljoin

    print(f"Testing {url}")
    try:
        # An explicit timeout is required: without one the request can block
        # indefinitely and the Timeout handler below would never trigger.
        r = requests.head(url, timeout=30)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Allow the test to succeed with intermittent issues.
        # (TooManyRedirects is not caught as could be a broken url.)
        return None

    code = r.status_code
    # Servers may reply with status codes requests has no name for.
    names = requests.status_codes._codes.get(code)  # pylint: disable=protected-access
    message = names[0] if names else "unknown status"
    print(f"{message} ({code})")

    if code == 302:
        location = r.headers.get("Location")
        if location is None:
            # Nothing to follow, and a 302 on its own is not a failure.
            print()
            return None
        if max_redirects <= 0:
            raise requests.exceptions.TooManyRedirects(
                f"Too many redirects while checking {url}")
        # The Location header may be relative to the current URL.
        target_result = check_url_works(urljoin(url, location),
                                        max_redirects - 1)
        if target_result is None:
            return None
        # Report the failure against the URL found in the documentation.
        _, message, code = target_result
        return url, message, code

    # Allow any non 4xx status code, as other failures could be temporary
    # and break the CI tests.
    if 400 <= code < 500:
        return url, message, code

    print()
    return None
def assert_perf_counter_size(perf, inputs, outputs, steps, outsteps=None):
    """Assert that the performance counters have the expected dimensions.

    Args:
        perf: Mapping with the keys ``'input'``, ``'input_complete'``,
            ``'output'`` and ``'output_complete'``; each value is a sequence
            of per-tensor sequences.
        inputs: Expected number of entries under the two input keys.
        outputs: Expected number of entries under the two output keys.
        steps: Expected length of every input entry.
        outsteps: Expected length of every output entry. Defaults to
            ``steps`` when not provided.

    Raises:
        AssertionError: if any of the sizes does not match.
    """

    def assert_size(counters, elems, expected_steps):
        assert len(counters) == elems
        for elem in counters:
            assert len(elem) == expected_steps

    # Only fall back to ``steps`` when no value was given: an explicit 0 is
    # a legitimate expectation and must not be replaced.
    if outsteps is None:
        outsteps = steps

    assert_size(perf['input'], inputs, steps)
    assert_size(perf['input_complete'], inputs, steps)
    assert_size(perf['output'], outputs, outsteps)
    assert_size(perf['output_complete'], outputs, outsteps)
@pytest.mark.parametrize("mode, period", [(poptorch.OutputMode.Final, 1),
                                          (poptorch.OutputMode.All, 1),
                                          (poptorch.OutputMode.Sum, 1),
                                          (poptorch.OutputMode.EveryN, 2)])
@pytest.mark.parametrize("steps", [2, 4])
@pytest.mark.parametrize("accums", [1, 2])
@pytest.mark.parametrize("replicas", [1, 2])
@pytest.mark.ipuHardwareRequired
def test_training(mode, period, steps, accums, replicas):
    """Check the size of the perf counters and the latency values reported
    by a training model for each output mode."""
    torch.manual_seed(42)
    inputs = torch.randn(16, 100)
    targets = torch.randn(16, 100)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(100, 100)
            self.loss = torch.nn.MSELoss()

        def forward(self, data, target):
            out = self.linear(data)
            return out, self.loss(out, target)

    opts = poptorch.Options()
    opts.outputMode(mode, period)
    opts.deviceIterations(steps)
    opts.Training.gradientAccumulation(accums)
    opts.replicationFactor(replicas)

    poptorch_model = poptorch.trainingModel(Model(), options=opts)
    poptorch_model(inputs, targets)
    perf = poptorch_model.getPerfCounters()

    total_steps = steps * accums * replicas
    if mode in (poptorch.OutputMode.Final, poptorch.OutputMode.Sum):
        expected_out_steps = replicas
    elif mode is poptorch.OutputMode.EveryN:
        expected_out_steps = steps // period * accums * replicas
    else:
        expected_out_steps = total_steps

    # Two inputs (data, target) and two outputs (out, loss).
    assert_perf_counter_size(perf, 2, 2, total_steps, expected_out_steps)
    assert_latency_values(poptorch_model)
with subprocess.Popen(sys.argv[1:], start_new_session=True) as p:
    try:
        print("Setting timeout to %d seconds" % timeout, flush=True)
        p.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        print("Timeout after %d seconds" % timeout, flush=True)
        # Timeout: send a segmentation fault signal to generate a core dump.
        process_group = os.getpgid(p.pid)
        # Print the native and Python backtraces of every thread first.
        # This is best effort: if gdb cannot be started the exception must
        # not leave the "with" block, otherwise Popen would wait on the
        # hung process without it ever being signalled.
        try:
            subprocess.run([  # pylint: disable=subprocess-run-check
                "gdb", "--batch", "--quiet", "-ex", "thread apply all bt",
                "-ex", "thread apply all py-bt", "-ex", "detach", "-ex",
                "quit", "-p",
                str(process_group)
            ])
        except OSError as err:
            print("Could not run gdb to collect backtraces: %s" % err,
                  flush=True)
        print("Sending signal to process group %d of process %d" %
              (process_group, p.pid),
              flush=True)
        os.killpg(process_group, signal.SIGSEGV)
        print("Waiting for aborted process...", flush=True)
        # Wait for the process to exit cleanly
        p.wait()
        # Signal to ctest it was a timeout
        print("TEST_TIMEOUT", flush=True)
        # give ctest some time to process the timeout
        time.sleep(60)
        print("ERROR: Shouldn't have reached this point", flush=True)
        # Note: in theory ctest should kill this process 1 second after
        # TEST_TIMEOUT was printed.

# Propagate the exit status of the wrapped command.
sys.exit(p.returncode)
torch.set_default_dtype(torch.float32) # yapf: disable # pylint: disable=line-too-long EXPECTED_FAILURES = { # TODO(T26651) Popart feature request "test_nn_BatchNorm3d_not_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.onnx.BatchNormalization:9, inputs=[Reshape:0], outputs=[]), but input is not optional", # TODO(T31811): Circular padding "test_nn_Padding12_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Padding31_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Padding33_1dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Padding1221_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Padding2322_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Padding3331_2dcircular": "outputs_[i]->uses().empty() INTERNAL ASSERT FAILED", "test_nn_Conv1d_circular_stride2_pad2": "margin of error", "test_nn_Conv2d_circular_stride2_pad2": "margin of error", # TODO(T26652): Popart feature request "test_nn_LayerNorm_3d_no_elementwise_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_LayerNorm_1d_no_elementwise_affine": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_GroupNorm_1d_no_affine_IN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_GroupNorm_1d_no_affine_LN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_GroupNorm_2d_no_affine_IN": "Weights & bias are mandatory in Popart: No input found 
for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_GroupNorm_2d_no_affine_LN": "Weights & bias are mandatory in Popart: No input found for input 1 of Op(ai.graphcore.GroupNormalization:1, inputs=[Flatten:0], outputs=[]), but input is not optional", "test_nn_interpolate_nearest_1d_zero_dim": "Zero-sized tensors are unsupported", "test_nn_interpolate_nearest_2d_zero_dim": "Zero-sized tensors are unsupported", "test_nn_interpolate_nearest_3d_zero_dim": "Zero-sized tensors are unsupported", "test_nn_CrossMapLRN2d": "Broadcasting failed", "test_nn_PReLU_1d_multiparam": "Broadcasting failed", "test_nn_PReLU_2d_multiparam": "Broadcasting failed", "test_nn_PReLU_3d_multiparam": "Broadcasting failed", "test_nn_BatchNorm1d_3d_input_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization", "test_nn_BatchNorm1d_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization", "test_nn_BatchNorm2d_not_affine": "No input found for input 1 of ai.onnx.BatchNormalization", "test_nn_GroupNorm_1d_affine": "margin of error", "test_nn_LayerNorm_1d_empty_elementwise_affine": "std::out_of_range exception", "test_nn_Conv1d_zero_batch": "StepIO did not provide input data for tensor input", "test_nn_Conv2d_zero_batch": "StepIO did not provide input data for tensor input", "test_nn_Conv3d_zero_batch": "StepIO did not provide input data for tensor input", "test_nn_ConvTranspose1d_dilated": "Popart exception format", "test_nn_ConvTranspose2d_dilated": "Popart exception format", "test_nn_ConvTranspose3d_dilated": "Popart exception format", "test_nn_MaxPool2d_3d_input": "Invalid length of strides vector", "test_nn_LPPool2d_norm": "Invalid length of padding vector", "test_nn_AdaptiveAvgPool1d": "margin of error", "test_nn_AdaptiveAvgPool1d_one_output": "margin of error", "test_nn_AdaptiveAvgPool2d_single": "margin of error", "test_nn_AdaptiveAvgPool2d_tuple": "margin of error", 
"test_nn_AdaptiveAvgPool2d_tuple_none": "margin of error", "test_nn_AdaptiveAvgPool2d_alert_nondeterministic": "margin of error", "test_nn_AdaptiveAvgPool3d_single": "margin of error", "test_nn_AdaptiveAvgPool3d_tuple": "margin of error", "test_nn_AdaptiveAvgPool3d_tuple_none": "margin of error", "test_nn_AdaptiveAvgPool3d_alert_nondeterministic": "margin of error", "test_nn_Conv3d_circular_stride2_pad2": "hangs ? really slow ?", "test_nn_Padding122112_3dcircular": "hangs ? really slow ?", "test_nn_Padding322112_3dcircular": "hangs ? really slow ?", "test_nn_Padding332122_3dcircular": "hangs ? really slow ?", "test_nn_softmax_spatial_dtype": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.662305723875761 (0.706894040107727 vs. 0.04458831623196602), which occurred at index (0, 1, 2, 3).", "test_nn_Softmin_multidim": "AssertionError: With rtol=0.0001 and atol=1e-05, found 300 element(s) (out of 300) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.528009258210659 (0.5376919507980347 vs. 0.009682692587375641), which occurred at index (0, 1, 4, 6).", "test_nn_GroupNorm_2d_affine": "AssertionError: With rtol=0.0001 and atol=1e-05, found 144 element(s) (out of 144) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 1.3549566268920898 (2.1383447647094727 vs. 0.7833881378173828), which occurred at index (2, 2, 1, 1).", "test_nn_AvgPool1d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 12 element(s) (out of 24) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.39088454842567444 (0.39088454842567444 vs. 
0.7817690968513489), which occurred at index (0, 2, 3).", "test_nn_AvgPool2d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 96) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.7429442256689072 (0.24764807522296906 vs. 0.9905923008918762), which occurred at index (0, 1, 0, 0).", "test_nn_AvgPool2d_divisor": "AssertionError: With rtol=0.0001 and atol=1e-05, found 54 element(s) (out of 54) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.755292057991028 (3.673722743988037 vs. 0.9184306859970093), which occurred at index (1, 2, 2, 2).", "test_nn_AvgPool2d_divisor_stride": "AssertionError: With rtol=0.0001 and atol=1e-05, found 54 element(s) (out of 54) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.236291766166687 (2.981722354888916 vs. 0.745430588722229), which occurred at index (0, 1, 2, 2).", "test_nn_AvgPool2d_divisor_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 96) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 2.427279531955719 (3.236372709274292 vs. 0.809093177318573), which occurred at index (0, 0, 1, 2).", "test_nn_AvgPool3d_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 114 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.6763140857219696 (0.22543802857398987 vs. 0.9017521142959595), which occurred at index (0, 2, 2, 0, 0).", "test_nn_AvgPool3d_stride_pad_gpu_fixedkw_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 66 element(s) (out of 72) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.3809252828359604 (0.22855515778064728 vs. 
0.6094804406166077), which occurred at index (1, 1, 0, 0, 1).", "test_nn_AvgPool3d_stride_pad_gpu_general_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 264 element(s) (out of 270) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.4184434115886688 (0.16373875737190247 vs. 0.5821821689605713), which occurred at index (0, 2, 2, 0, 4).", "test_nn_AvgPool3d_stride_pad_gpu_input_nooverlap": "AssertionError: With rtol=0.0001 and atol=1e-05, found 156 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.8563244119286537 (0.12233205884695053 vs. 0.9786564707756042), which occurred at index (0, 0, 2, 2, 0).", "test_nn_AvgPool3d_divisor": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.214784324169159 (5.959753513336182 vs. 0.7449691891670227), which occurred at index (0, 2, 0, 0, 0).", "test_nn_AvgPool3d_divisor_stride": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.80545711517334 (5.491950988769531 vs. 0.6864938735961914), which occurred at index (1, 1, 1, 1, 0).", "test_nn_AvgPool3d_divisor_stride_pad": "AssertionError: With rtol=0.0001 and atol=1e-05, found 156 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.580634713172913 (5.235011100769043 vs. 0.6543763875961304), which occurred at index (1, 2, 1, 1, 1).", "test_nn_AvgPool3d_divisor_stride_pad_gpu_fixedkw_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 72 element(s) (out of 72) whose difference(s) exceeded the margin of error (including 0 nan comparisons). 
The greatest difference was 33.63082355260849 (34.16464614868164 vs. 0.5338225960731506), which occurred at index (1, 1, 1, 1, 1).", "test_nn_AvgPool3d_divisor_stride_pad_gpu_general_output": "AssertionError: With rtol=0.0001 and atol=1e-05, found 270 element(s) (out of 270) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 32.37348711490631 (32.887351989746094 vs. 0.5138648748397827), which occurred at index (1, 1, 1, 1, 2).", "test_nn_AvgPool3d_divisor_stride1_pad0_gpu_input": "AssertionError: With rtol=0.0001 and atol=1e-05, found 48 element(s) (out of 48) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 15.688140630722046 (16.29153060913086 vs. 0.6033899784088135), which occurred at index (0, 2, 0, 0, 0).", "test_nn_AvgPool3d_divisor_stride_pad_gpu_input_nooverlap": "AssertionError: With rtol=0.0001 and atol=1e-05, found 114 element(s) (out of 162) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.383516907691956 (6.152590751647949 vs. 0.7690738439559937), which occurred at index (0, 2, 1, 1, 1).", "test_nn_GELU": "AssertionError: With rtol=0.0001 and atol=1e-05, found 7 element(s) (out of 30) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.00013697147369384766 (0.7933593392372131 vs. 0.7932223677635193), which occurred at index (0, 1, 3).", "test_nn_GELU_no_batch_dim": "AssertionError: Tensor-likes are not close", "test_nn_softmax_spatial_special": "AssertionError: With rtol=0.0001 and atol=1e-05, found 1024 element(s) (out of 1024) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.009421411203220487 (0.012496765702962875 vs. 
0.0030753544997423887), which occurred at index (0, 114, 0, 0).", "test_nn_softmax_spatial": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.6575267650187016 (0.7020591497421265 vs. 0.04453238472342491), which occurred at index (1, 0, 1, 0).", "test_nn_softmax_functional_dim0": "AssertionError: With rtol=0.0001 and atol=1e-05, found 120 element(s) (out of 120) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.7021725764498115 (0.7146565914154053 vs. 0.012484014965593815), which occurred at index (0, 1, 2, 1).", "test_nn_log_softmax_spatial_special": "AssertionError: With rtol=0.0001 and atol=1e-05, found 1024 element(s) (out of 1024) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 1.4219598770141602 (-4.52596378326416 vs. -5.94792366027832), which occurred at index (0, 127, 1, 0).", "test_nn_log_softmax_spatial": "AssertionError: With rtol=0.0001 and atol=1e-05, found 64 element(s) (out of 64) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 3.2339431643486023 (-0.6880427002906799 vs. -3.9219858646392822), which occurred at index (1, 0, 2, 0).", "test_nn_log_softmax_dim0": "AssertionError: With rtol=0.0001 and atol=1e-05, found 120 element(s) (out of 120) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.541183948516846 (-0.645355224609375 vs. -5.186539173126221), which occurred at index (0, 0, 2, 0).", "test_nn_Softmax2d": "AssertionError: With rtol=0.0001 and atol=1e-05, found 600 element(s) (out of 600) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.5474691272247583 (0.5498999953269958 vs. 
0.002430868102237582), which occurred at index (0, 1, 6, 13).", "test_nn_LogSoftmax_multiparam": "AssertionError: With rtol=0.0001 and atol=1e-05, found 600 element(s) (out of 600) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 5.692737579345703 (-1.065810203552246 vs. -6.758547782897949), which occurred at index (0, 2, 7, 2).", "test_nn_TransformerEncoderLayer_gelu_activation": "AssertionError: With rtol=0.0001 and atol=1e-05, found 2 element(s) (out of 24) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 6.428360939025879e-05 (-0.4767186641693115 vs. -0.47665438055992126), which occurred at index (1, 2, 0).", "test_nn_TransformerDecoderLayer_gelu_activation": "AssertionError: With rtol=0.0001 and atol=1e-05, found 3 element(s) (out of 36) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.00011313706636428833 (-0.07721851021051407 vs. 
-0.07710537314414978), which occurred at index (0, 0, 3).", "test_nn_LPPool1d_norm": "Output dimensions mismatch: assert torch.Size([1, 3, 3]) == torch.Size([1, 3, 6])", "test_nn_ConvTranspose1d": "Output dimensions mismatch: assert torch.Size([1, 4, 20]) == torch.Size([1, 4, 19])", "test_nn_ConvTranspose1d_no_bias": "Output dimensions mismatch: assert torch.Size([1, 4, 12]) == torch.Size([1, 4, 11])", "test_nn_ConvTranspose1d_groups": "Output dimensions mismatch: assert torch.Size([2, 6, 20]) == torch.Size([2, 6, 19])", "test_nn_ConvTranspose2d": "Output dimensions mismatch: assert torch.Size([1, 4, 20, 12]) == torch.Size([1, 4, 19, 11])", "test_nn_ConvTranspose2d_no_bias": "Output dimensions mismatch: assert torch.Size([1, 4, 12, 20]) == torch.Size([1, 4, 11, 19])", "test_nn_BCELoss_weights_no_reduce": "RuntimeError: expected int at position 0, but got: Tensor", "test_nn_Bilinear": "TypeError: bilinear(): argument 'input2' (position 2) must be Tensor, not tuple", "test_nn_Embedding": "RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)", "test_nn_BCELoss_weights_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_BCEWithLogitsLoss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_KLDivLoss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_KLDivLoss_no_reduce_scalar_log_target": "IndexError: tuple index out of range", "test_nn_L1Loss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_MSELoss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_SmoothL1Loss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_MultiLabelMarginLoss_0d_no_reduce": "IndexError: tuple index out of range", "test_nn_BCELoss_no_reduce_scalar": "IndexError: tuple index out of range", "test_nn_softmax_functional_scalar": "IndexError: tuple index out of range", "test_nn_SELU_scalar": 
"IndexError: tuple index out of range", "test_nn_CELU_scalar": "IndexError: tuple index out of range", "test_nn_GELU_scalar": "IndexError: tuple index out of range", "test_nn_log_softmax_scalar": "IndexError: tuple index out of range", "test_nn_Threshold_threshold_value_scalar": "IndexError: tuple index out of range", "test_nn_ReLU_scalar": "IndexError: tuple index out of range", "test_nn_ReLU6_scalar": "IndexError: tuple index out of range", "test_nn_RReLU_with_up_down_scalar": "IndexError: tuple index out of range", "test_nn_Hardtanh_scalar": "IndexError: tuple index out of range", "test_nn_Sigmoid_scalar": "IndexError: tuple index out of range", "test_nn_Tanh_scalar": "IndexError: tuple index out of range", "test_nn_Softmax_scalar": "IndexError: tuple index out of range", "test_nn_LogSoftmax_multiparam_scalar": "IndexError: tuple index out of range", "test_nn_ELU_scalar": "IndexError: tuple index out of range", "test_nn_Hardshrink_scalar": "IndexError: tuple index out of range", "test_nn_LeakyReLU_with_negval_scalar": "IndexError: tuple index out of range", "test_nn_LogSigmoid_scalar": "IndexError: tuple index out of range", "test_nn_Softplus_beta_threshold_scalar": "IndexError: tuple index out of range", "test_nn_Softshrink_lambda_scalar": "IndexError: tuple index out of range", "test_nn_PReLU_scalar": "IndexError: tuple index out of range", "test_nn_Softsign_scalar": "IndexError: tuple index out of range", "test_nn_Softmin_scalar": "IndexError: tuple index out of range", "test_nn_Tanhshrink_scalar": "IndexError: tuple index out of range", "test_nn_SiLU_scalar": "IndexError: tuple index out of range", # input 1 for ai.graphcore.BatchNormalization "test_nn_BatchNorm2d_zero_batch": "ERROR in NormalizationOps.cpp:98: weight->type()->cast() == nullptr Context: PopartCanonicalization processing %10 : Float(0:20, 5:4, 2:2, 2:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)", "test_nn_BatchNorm1d_zero_batch": "RuntimeError: ERROR in 
NormalizationOps.cpp:98: weight->type()->cast() == nullptr Context: PopartCanonicalization processing %10 : Float(0:45, 5:9, 9:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)", "test_nn_BatchNorm3d_zero_batch": "RuntimeError: ERROR in NormalizationOps.cpp:98: weight->type()->cast() == nullptr Context: PopartCanonicalization processing %10 : Float(0:40, 5:8, 2:4, 2:2, 2:1) = aten::batch_norm(%input, %4, %5, %11, %12, %13, %14, %15, %16)", # margin of error "test_nn_KLDivLoss_no_reduce": "Unsupported op(s): aten::kl_div aten::kl_div", "test_nn_FractionalMaxPool2d_alert_nondeterministic": "T30594", "test_nn_FractionalMaxPool2d_ratio": "T30594", "test_nn_FractionalMaxPool2d_ratio_no_batch_dim": "T30594", "test_nn_FractionalMaxPool2d_ratio_no_batch_dim_no_random_samples": "T30594", "test_nn_FractionalMaxPool2d_ratio_return_indices": "T30594", "test_nn_FractionalMaxPool2d_size": "T30594", "test_nn_FractionalMaxPool2d_size_no_batch_dim": "T30594", "test_nn_FractionalMaxPool2d_size_no_batch_dim_no_random_samples": "T30594", "test_nn_FractionalMaxPool3d_alert_nondeterministic": "T30594", "test_nn_FractionalMaxPool3d_asymsize": "T30594", "test_nn_FractionalMaxPool3d_ratio": "T30594", "test_nn_FractionalMaxPool3d_ratio_no_batch_dim": "T30594", "test_nn_FractionalMaxPool3d_ratio_no_batch_dim_no_random_samples": "T30594", "test_nn_FractionalMaxPool3d_ratio_return_indices": "T30594", "test_nn_FractionalMaxPool3d_size": "T30594", "test_nn_FractionalMaxPool3d_size_no_batch_dim": "T30594", "test_nn_FractionalMaxPool3d_size_no_batch_dim_no_random_samples": "T30594", "test_nn_BCELoss_no_reduce": "T30603", "test_nn_BCEWithLogitsLoss_no_reduce": "T30603", "test_nn_NLLLoss_no_reduce_ignore_index": "T30603", "test_nn_NLLLoss_no_reduce_weights": "T30603", "test_nn_NLLLoss_no_reduce_weights_ignore_index": "T30603", "test_nn_NLLLoss_no_reduce_weights_ignore_index_neg": "T30603", "test_nn_NLLLoss2d_no_reduce": "T30603", "test_nn_NLLLoss2d_no_reduce_weights": "T30603", 
"test_nn_NLLLoss2d_no_reduce_ignore_index": "T30603", "test_nn_NLLLossNd_no_reduce": "T30603", "test_nn_NLLLossNd_no_reduce_weights": "T30603", "test_nn_NLLLossNd_no_reduce_ignore_index": "T30603", "test_nn_MultiLabelMarginLoss_index_neg": "T30603", "test_nn_MultiLabelMarginLoss_no_reduce": "T30603", "test_nn_HingeEmbeddingLoss_no_reduce": "T30603", "test_nn_HingeEmbeddingLoss_margin_no_reduce": "T30603", "test_nn_MultiMarginLoss_no_reduce": "T30603", "test_nn_MultiMarginLoss_1d_no_reduce": "T30603", "test_nn_multimarginloss_1d_input_0d_target_no_reduce": "T30603", "test_nn_MultiMarginLoss_p_no_reduce": "T30603", "test_nn_MultiMarginLoss_margin_no_reduce": "T30603", "test_nn_MultiMarginLoss_weights_no_reduce": "T30603", "test_nn_MultiLabelMarginLoss_1d_no_reduce": "T30603", "test_nn_AdaptiveMaxPool3d_single": "T30564", "test_nn_MaxPool1d_return_indices": "Max pool return indices not supported.", "test_nn_MaxPool2d_return_indices": "Max pool return indices not supported.", "test_nn_MaxPool3d_return_indices": "Max pool return indices not supported.", # TODO(T30564): Support adaptive max pool "test_nn_AdaptiveMaxPool1d": "T30564", "test_nn_AdaptiveMaxPool1d_no_batch_dim": "T30564", "test_nn_AdaptiveMaxPool2d_single": "T30564", "test_nn_AdaptiveMaxPool2d_tuple": "T30564", "test_nn_AdaptiveMaxPool2d_tuple_none": "T30564", "test_nn_AdaptiveMaxPool2d_alert_nondeterministic": "T30564", "test_nn_AdaptiveMaxPool2d_no_batch_dim": "T30564", "test_nn_AdaptiveMaxPool3d_no_batch_dim": "T30564", "test_nn_AdaptiveMaxPool3d_tuple": "T30564", "test_nn_AdaptiveMaxPool3d_tuple_none": "T30564", "test_nn_AdaptiveMaxPool3d_single_nonatomic": "T30564", "test_nn_AdaptiveMaxPool3d_tuple_nonatomic": "T30564", # Input dims indivisible by output dims, output doesn't match torch "test_nn_AdaptiveAvgPool1d_no_batch_dim": "Output differs from torch due to implementation detail", "test_nn_AdaptiveAvgPool2d_no_batch_dim": "Output differs from torch due to implementation detail", 
"test_nn_AdaptiveAvgPool3d_no_batch_dim": "Output differs from torch due to implementation detail", # torch.complex128 not supported "test_nn_L1Loss_no_reduce_complex": "torch.complex128 not supported", "test_nn_ReflectionPad1d_complex": "torch.complex128 not supported", "test_nn_ReflectionPad2d_complex": "torch.complex128 not supported", "test_nn_ReflectionPad3d_complex": "torch.complex128 not supported", "test_nn_ReplicationPad1d_complex": "torch.complex128 not supported", "test_nn_ReplicationPad2d_complex": "torch.complex128 not supported", "test_nn_ConstantPad1d_complex": "torch.complex128 not supported", "test_nn_ConstantPad2d_complex": "torch.complex128 not supported", "test_nn_ConstantPad3d_complex": "torch.complex128 not supported", "test_nn_ReplicationPad3d_complex": "torch.complex128 not supported", "test_nn_ZeroPad2d_complex": "torch.complex128 not supported", # TODO(T42768): Support aten::_convolution_mode "test_nn_Conv1d_pad_valid": "T42768", "test_nn_Conv1d_pad_same": "T42768", "test_nn_Conv1d_pad_same2": "T42768", "test_nn_Conv1d_pad_same_dilated": "T42768", "test_nn_Conv2d_pad_valid": "T42768", "test_nn_Conv2d_pad_same": "T42768", "test_nn_Conv2d_pad_same_dilated": "T42768", "test_nn_Conv3d_pad_valid": "T42768", "test_nn_Conv3d_pad_same": "T42768", "test_nn_Conv3d_pad_same_dilated": "T42768", # TODO(T42770): Support torch.nn.HuberLoss "test_nn_HuberLoss_delta": "T42770", # TODO(T42771): Support torch.nn.PixelUnshuffle "test_nn_PixelUnshuffle": "T42771", # TODO(T42772): Support torch.nn.Mish "test_nn_Mish": "T42772", "test_nn_Mish_scalar": "T42772", "test_nn_Mish_no_batch_dim": "T42772", # TODO(T48781): Support torch.nn.Unflatten "test_nn_Unflatten_no_batch_dim": "T48781", # TODO(T49021): Support torch.nn.ReflectionPad3d "test_nn_ReflectionPad3d": "T49021", "test_nn_ReflectionPad3d_no_batch_dim": "T49021", # TODO(T49073): Match torch 1.10 implementation "test_nn_GroupNorm_1d_affine_large_batch": "T49073", # unsupported upsampling modes downstream 
"test_nn_interpolate_linear_1d": "Upsample mode not supported", "test_nn_interpolate_linear_tuple_1d": "Upsample mode not supported", "test_nn_interpolate_linear_scale_1d": "Upsample mode not supported", "test_nn_interpolate_linear_1d_zero_dim": "Upsample mode not supported", "test_nn_interpolate_linear_1d_align_corners": "Upsample mode not supported", "test_nn_interpolate_linear_scale_1d_align_corners": "Upsample mode not supported", "test_nn_interpolate_linear_1d_alert_nondeterministic": "Upsample mode not supported", "test_nn_interpolate_bilinear_2d_zero_dim": "Upsample mode not supported", "test_nn_interpolate_bilinear_scale_tuple_skewed_2d": "Upsample mode not supported", "test_nn_interpolate_bilinear_tuple_2d_align_corners": "Upsample mode not supported", "test_nn_interpolate_bilinear_scale_tuple_skewed_2d_align_corners": "Upsample mode not supported", "test_nn_interpolate_bilinear_2d_alert_nondeterministic": "Upsample mode not supported", "test_nn_interpolate_bicubic_2d": "Upsample mode not supported", "test_nn_interpolate_bicubic_2d_zero_dim": "Upsample mode not supported", "test_nn_interpolate_bicubic_tuple_2d": "Upsample mode not supported", "test_nn_interpolate_bicubic_scale_2d": "Upsample mode not supported", "test_nn_interpolate_bicubic_scale_tuple_shared_2d": "Upsample mode not supported", "test_nn_interpolate_bicubic_scale_tuple_skewed_2d": "Upsample mode not supported", "test_nn_interpolate_bicubic_tuple_2d_align_corners": "Upsample mode not supported", "test_nn_interpolate_bicubic_scale_tuple_skewed_2d_align_corners": "Upsample mode not supported", "test_nn_interpolate_bicubic_2d_alert_nondeterministic": "Upsample mode not supported", "test_nn_interpolate_trilinear_3d": "Upsample mode not supported", "test_nn_interpolate_trilinear_3d_zero_dim": "Upsample mode not supported", "test_nn_interpolate_trilinear_tuple_3d": "Upsample mode not supported", "test_nn_interpolate_trilinear_scale_3d": "Upsample mode not supported", 
"test_nn_interpolate_trilinear_tuple_3d_align_corners": "Upsample mode not supported", "test_nn_interpolate_trilinear_scale_3d_align_corners": "Upsample mode not supported", "test_nn_interpolate_trilinear_3d_alert_nondeterministic": "Upsample mode not supported", "test_nn_EmbeddingBag_sparse": "T27057: sparse gradient support", "test_nn_Embedding_sparse": "T27057: sparse gradient support", "test_nn_MultiheadAttention": "ai.onnx.Dropout:10 ratio value 0 is not valid", "test_nn_EmbeddingBag_sum_padding_idx": "padding_idx not supported", "test_nn_EmbeddingBag_mean_padding_idx": "padding_idx not supported", "test_nn_EmbeddingBag_max_padding_idx": "padding_idx not supported", } FLOAT_EXPECTED_FAILURES = { # Tests that fail on float only, e.g. due to OOM on the small IPU model "test_nn_LayerNorm_3d_no_affine_large_feature": "Tile 0 receives more data than it has total memory in exchange", } HALF_EXPECTED_FAILURES = { # T30731 - tests failing with very large error "test_nn_BatchNorm1d_affine_simple_average": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm1d_not_tracking_stats": "Tensor-likes are not close", "test_nn_BatchNorm2d_momentum": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm3d_momentum": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm1d_3d_input": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm2d_2d_simple_average": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm1d_affine": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm2d": "AssertionError: With rtol=0.05 and atol=0.0001", "test_nn_BatchNorm3d": "AssertionError: With rtol=0.05 and atol=0.0001, found 384 element(s) (out of 384) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 312.5105660557747 (0.9894339442253113 vs. 
# Per-test tolerance overrides used by test_pytorch_nn for the float run.
# Maps test name -> (rtol, atol). Tests that are not listed fall back to
# (None, None).
FLOAT_PRECISION_EXCEPTIONS = {
    "test_nn_GroupNorm_2d_affine_large_feature": (1e-3, 1e-3),
    "test_nn_GroupNorm_2d_no_affine_large_feature": (1e-3, 1e-3),
}

# Per-test tolerance overrides used by test_pytorch_nn for the half run.
# Maps test name -> (rtol, atol). Tests that are not listed fall back to
# (0.05, 1e-4).
HALF_PRECISION_EXCEPTIONS = {
    "test_nn_BatchNorm2d_not_tracking_stats": (0.05, 1e-3),
    "test_nn_BatchNorm3d_not_tracking_stats": (0.05, 1e-2),
    "test_nn_Conv1d_dilated": (0.05, 1e-3),
    "test_nn_Conv1d_pad2": (0.05, 1e-3),
    "test_nn_Conv2d_depthwise_padded": (0.05, 1e-3),
    # TODO(T31811)?
    "test_nn_Conv2d_groups": (0.05, 1e-3),
    "test_nn_Conv2d_groups_thnn": (0.05, 1e-3),
    "test_nn_Conv2d_replicate_stride2_pad2": (0.05, 1e-3),
    "test_nn_Conv3d_dilated": (0.05, 1e-3),
    "test_nn_Conv3d_groups": (0.05, 1e-3),
    "test_nn_GroupNorm_2d_affine_large_feature": (0.05, 1e-2),
    "test_nn_GroupNorm_2d_no_affine_large_feature": (0.05, 1e-2),
    "test_nn_InstanceNorm1d": (0.05, 1e-3),
    "test_nn_InstanceNorm1d_no_batch_dim": (0.05, 1e-3),
    "test_nn_InstanceNorm3d_no_batch_dim": (0.05, 1e-3),
    "test_nn_KLDivLoss_with_target_no_reduce": (0.05, 1e-2),
    "test_nn_LayerNorm_1d_elementwise_affine": (0.05, 0.002),
    "test_nn_LayerNorm_3d_elementwise_affine": (0.05, 0.002),
    "test_nn_LayerNorm_3d_no_affine_large_feature": (0.05, 0.002),
    "test_nn_Linear_no_bias": (0.05, 1e-3),
    "test_nn_TransformerDecoderLayer_relu_activation": (0.05, 1e-2),
    "test_nn_Transformer_multilayer_coder": (0.05, 1e-2),
}
def assert_allclose(native_out, poptorch_out, rtol, atol):
    """Recursively compare a native PyTorch output with a PopTorch output.

    Tuples are compared element by element (and must have the same length);
    tensors must have the same size and are then compared as float tensors
    through helpers.assert_allclose.

    Args:
        native_out: Reference output: a tensor or a (nested) tuple of tensors.
        poptorch_out: Output to check, with the same structure as native_out.
        rtol: Relative tolerance forwarded to helpers.assert_allclose.
        atol: Absolute tolerance forwarded to helpers.assert_allclose.

    Raises:
        AssertionError: If the structures, sizes or values do not match.
    """
    if isinstance(native_out, tuple):
        assert isinstance(poptorch_out, tuple)
        # Without this check extra PopTorch outputs would be silently ignored
        # and missing ones would surface as an IndexError.
        assert len(native_out) == len(poptorch_out), (
            f"Expected {len(native_out)} outputs, got {len(poptorch_out)}")
        for native_out_t, poptorch_out_t in zip(native_out, poptorch_out):
            assert_allclose(native_out_t, poptorch_out_t, rtol, atol)
        return
    if native_out.size() == tuple():
        # Same result as torch.tensor(native_out.float()) but without the
        # copy-construct UserWarning.
        native_out = native_out.float().clone().detach()
    assert native_out.size() == poptorch_out.size()
    helpers.assert_allclose(expected=native_out.float(),
                            actual=poptorch_out.float(),
                            rtol=rtol,
                            atol=atol)
if __name__ == "__main__":
    # Command line entry point: run the named tests directly, without pytest.
    # Set the environment variable HALF=1 to run them in half precision.
    assert len(sys.argv) >= 2, f"Usage {sys.argv[0]} (test_name)+"
    # Disable expected failures (all three tables, otherwise a float-only
    # expected failure would still trigger pytest.skip() in test_pytorch_nn):
    EXPECTED_FAILURES.clear()
    HALF_EXPECTED_FAILURES.clear()
    FLOAT_EXPECTED_FAILURES.clear()

    use_half = os.environ.get("HALF", "0") == "1"

    if len(sys.argv) == 2:
        # Single test: let any exception propagate so the traceback is shown.
        test_pytorch_nn(sys.argv[1], use_half)
        sys.exit(0)

    # Several tests: keep going after a failure and report them at the end.
    fails = []
    for testname in sys.argv[1:]:
        try:
            test_pytorch_nn(testname, use_half)
        except (RuntimeError, AssertionError, poptorch.Error):
            fails.append(testname)

    if fails:
        print("\nFailed Tests:")
        for fail in fails:
            print("\t" + fail)
    # Exit with the number of failures, capped so that the status cannot
    # wrap around to 0 (success) when truncated to 8 bits.
    sys.exit(min(len(fails), 255))
# AlexNet # VGG-11 # VGG-13 # VGG-16 # VGG-19 # VGG-11 with batch normalization # VGG-13 with batch normalization # VGG-16 with batch normalization # VGG-19 with batch normalization # ResNet-18 # ResNet-34 # ResNet-50 # ResNet-101 # ResNet-152 # SqueezeNet 1.0 # SqueezeNet 1.1 # Densenet-121 # Densenet-169 # Densenet-201 # Densenet-161 # Inception v3 # GoogleNet # ShuffleNet V2 # MobileNet V2 # ResNeXt-50-32x4d # ResNeXt-101-32x8d # Wide ResNet-50-2 # Wide ResNet-101-2 # MNASNet 1.0 # Models here are hopefully representative of their cousins (i.e test Resnet18 without testing Resnet-34/50/101/152) # The others will be tested in hardware benchmark tests, tested_models = [ models.resnet18, models.resnext50_32x4d, models.mnasnet1_0, models.mobilenet_v2, models.googlenet, models.inception_v3, # SqueezeNet v1.0 simply has more parameters and a greater computational cost models.squeezenet1_1, ] # Deliberately un-tested models untested_models = [ models.vgg11, # Supported but takes a long time to compile. models.shufflenet_v2_x1_0, # Supported but takes a long time to compile. models.densenet121, # Supported but takes a long time to compile. models.wide_resnet50_2, # Supported but doesn't fit on 1 IPU. # Supported on IPU_MODEL but runs into stream limit on IPU. models.alexnet, ] def inference_harness(imagenet_model): torch.manual_seed(42) image_input = torch.randn([1, 3, 224, 224]) # We are running on a dummy input so it doesn't matter if the weights are trained. model = imagenet_model(pretrained=False) model.eval() # Run on CPU. 
@pytest.mark.parametrize("input_type", MANY_TYPES)
@pytest.mark.parametrize("output_type", MANY_TYPES)
def test_many_input_output_types(input_type, output_type):
    """Add two tensors of input_type and cast the sum to output_type.

    The dtype of the result is only checked for output types that are not
    listed in DEMOTED_ON_IPU; the values are always checked with an absolute
    tolerance of 0.5.
    """
    adder = get_simple_adder(output_type)

    lhs = torch.tensor([1.0, 25, -1.0, 83], dtype=input_type)
    rhs = torch.tensor([2.0, 35, 1.0, 32.4], dtype=input_type)
    result = adder(lhs, rhs)

    if output_type not in DEMOTED_ON_IPU:
        for idx in (0, 1):
            assert result[idx].dtype == output_type

    expected_sum = torch.tensor([3., 60., 0., 115.4])
    helpers.assert_allclose(actual=result,
                            expected=expected_sum,
                            atol=0.5,
                            rtol=0)
@pytest.mark.parametrize("input_2_type", MANY_TYPES) @pytest.mark.parametrize("output_type", MANY_TYPES) def test_many_implicit_cast(input_1_type, input_2_type, output_type): model = get_simple_adder(output_type) t1 = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_1_type) t2 = torch.tensor([2.0, 35., 1.0, 32.4], dtype=input_2_type) helpers.assert_allclose(actual=model(t1, t2), expected=torch.tensor([3., 60., 0., 115.4]), atol=0.5, rtol=0) def get_unpack_clamp(): class UnpackClamp(nn.Module): def forward(self, x): i, _ = x return i.clamp(-1, 1) return poptorch.inferenceModel(UnpackClamp()) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_clamp_many_types(input_type): model = get_unpack_clamp() x = torch.tensor([[-2, -1, 0, 1, 2], [0, 0, 0, 0, 0]], dtype=input_type) y = model(x) np.testing.assert_allclose(y.numpy(), np.array([-1, -1, 0, 1, 1])) def get_simple_add_two(): class GetSimpleAddTwo(nn.Module): def forward(self, x): return x + 2 return poptorch.inferenceModel(GetSimpleAddTwo()) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_add_two_many_types(input_type): model = get_simple_add_two() t = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_type) helpers.assert_allclose(actual=model(t), expected=torch.tensor([3.0, 27., 1, 85.]), atol=0.5, rtol=0) def get_simple_incrementer(constant_type, return_type): class SimpleIncrementer(nn.Module): def forward(self, x): return (x + torch.tensor(1, dtype=constant_type)).type(return_type) return poptorch.inferenceModel(SimpleIncrementer()) @pytest.mark.parametrize("input_type", MANY_TYPES) @pytest.mark.parametrize("constant_type", MANY_TYPES) @pytest.mark.parametrize("output_type", MANY_TYPES) def test_many_constant_implicit_cast(input_type, constant_type, output_type): #Will not trace if constant_type == torch.float16: return model = get_simple_incrementer(constant_type, output_type) t = torch.tensor([1.0, 25., -1.0, 83.], dtype=input_type) helpers.assert_allclose(actual=model(t), 
expected=torch.tensor([2.0, 26., 0, 84.]), atol=0.5, rtol=0) @pytest.mark.parametrize("input_1_type", MANY_TYPES) @pytest.mark.parametrize("input_2_type", MANY_TYPES) def test_many_implicit_cast_greater_than(input_1_type, input_2_type): class GreaterThan(nn.Module): def forward(self, x, y): return x > y model = poptorch.inferenceModel(GreaterThan()) t1 = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type) t2 = torch.tensor([2.4, 2, 1.0, 32.4], dtype=input_2_type) helpers.assert_allequal(actual=model(t1, t2), expected=torch.tensor([False, False, True, True])) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_many_implicit_cast_greater_than_one(input_type): class GreaterThanOne(nn.Module): def forward(self, x): return x > 1 model = poptorch.inferenceModel(GreaterThanOne()) t = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type) helpers.assert_allequal(actual=model(t), expected=torch.tensor([True, False, True, True])) @pytest.mark.parametrize("input_1_type", MANY_TYPES) @pytest.mark.parametrize("input_2_type", MANY_TYPES) def test_many_implicit_cast_equals(input_1_type, input_2_type): class Equals(nn.Module): def forward(self, x, y): return x == y model = poptorch.inferenceModel(Equals()) t1 = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type) t2 = torch.tensor([2.4, 2, 2.0, 550.4], dtype=input_2_type) depends = False if (input_1_type == torch.float16 and input_2_type == torch.float16): depends = True if (input_1_type in (torch.float32, torch.float64) and input_2_type in (torch.float32, torch.float64)): depends = True if (input_1_type in (torch.int32, torch.int64) and input_2_type in (torch.int32, torch.int64)): depends = True helpers.assert_allequal(actual=model(t1, t2), expected=torch.tensor( [False, False, True, depends])) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_many_implicit_cast_equals_one(input_type): class EqualsOne(nn.Module): def forward(self, x): return x == 1 model = poptorch.inferenceModel(EqualsOne()) t = 
torch.tensor([2.5, 1, 2.0, 550.4], dtype=input_type) helpers.assert_allequal(actual=model(t), expected=torch.tensor([False, True, False, False])) @pytest.mark.parametrize("input_1_type", MANY_TYPES) @pytest.mark.parametrize("input_2_type", MANY_TYPES) def test_many_implicit_cast_less_than(input_1_type, input_2_type): class LessThan(nn.Module): def forward(self, x, y): return x < y model = poptorch.inferenceModel(LessThan()) t1 = torch.tensor([1, -1, 2.0, 550.4], dtype=input_1_type) t2 = torch.tensor([2.4, 2, 1.0, 32.4], dtype=input_2_type) helpers.assert_allequal(actual=model(t1, t2), expected=torch.tensor([True, True, False, False])) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_many_implicit_cast_less_than_one(input_type): class LessThanOne(nn.Module): def forward(self, x): return x < 1 model = poptorch.inferenceModel(LessThanOne()) t = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type) helpers.assert_allequal(actual=model(t), expected=torch.tensor([False, True, False, False])) @pytest.mark.parametrize("input_type", MANY_TYPES) def test_many_implicit_cast_one_less_than(input_type): class OneLessThan(nn.Module): def forward(self, x): return 1 < x # pylint: disable=misplaced-comparison-constant model = poptorch.inferenceModel(OneLessThan()) t = torch.tensor([2.5, -1, 2.0, 550.4], dtype=input_type) helpers.assert_allequal(actual=model(t), expected=torch.tensor([True, False, True, True])) @pytest.mark.parametrize("input_type", [torch.int8, torch.uint8, torch.int16]) def test_small_int(input_type): class Model(nn.Module): def forward(self, x): return x.float() input = torch.arange(100) # Convert to desired input type input = input.to(input_type) model = poptorch.inferenceModel(Model()) output = model(input) assert output.dtype == torch.float helpers.assert_allequal(actual=output, expected=input.float()) @pytest.mark.parametrize("input_type", [torch.int8, torch.uint8, torch.int16]) def test_small_int_return(input_type): class Model(nn.Module): def 
forward(self, x): return x, x.float() + x.float() input = torch.arange(100) # Convert to desired input/output type input = input.to(input_type) model = poptorch.inferenceModel(Model()) output, _ = model(input) assert output.dtype == input_type helpers.assert_allequal(actual=output, expected=input) def test_tuple_and_list_constant(): class Model(torch.nn.Module): def forward(self): const1 = torch.tensor([1., 2.]) const2 = torch.tensor([3., 4.]) return torch.tensor(1), const1 + const2, [const1, const2] model = Model() inference_model = poptorch.inferenceModel(model) poptorch_out = inference_model() native = model() helpers.assert_allclose(actual=poptorch_out, expected=native) def test_tuple_and_list_constant_double_nested(): class Model(torch.nn.Module): def forward(self): const1 = torch.tensor([1., 2.]) const2 = torch.tensor([3., 4.]) return ([torch.tensor(1)], const1 + const2, ([const1, const2], [const1, const2]), const2) model = Model() inference_model = poptorch.inferenceModel(model) poptorch_out = inference_model() native = model() helpers.assert_allclose(actual=poptorch_out, expected=native) ================================================ FILE: tests/weights_writing_test.py ================================================ #!/usr/bin/env python3 # Copyright (c) 2020 Graphcore Ltd. All rights reserved. 
import os import types import copy import tempfile import unittest.mock import numpy as np import pytest import torch import torch.optim as optim import helpers import poptorch # pragma pylint: disable=unsubscriptable-object class ModelWithLoss(torch.nn.Module): def __init__(self, loss, use_dropout=False): super().__init__() self.linear = torch.nn.Linear(10, 10) self.loss = loss if use_dropout: self.dropout = torch.nn.Dropout() else: self.dropout = lambda x: x def forward(self, data, target=None): out = self.dropout(self.linear(data)) if target is None: return out loss = self.loss(out, target) return out, loss @pytest.mark.parametrize("use_half", [True, False]) def test_training_and_inference(use_half): torch.manual_seed(42) # 10 Batches of 10. input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) model = ModelWithLoss(torch.nn.CrossEntropyLoss()) if use_half: model.half() input = input.half() # Run on IPU batch size 1 * 10 popart batches. opts = poptorch.Options().deviceIterations(10) training = poptorch.trainingModel(model, options=opts) inference = poptorch.inferenceModel(model, options=opts) # Run all 10 batches as batchsize 10. out = inference(input) # Sanity check we weren't already matching the label. assert not torch.equal(torch.argmax(out.int(), dim=1), label) for _ in range(0, 1000): _, loss = training(input, label) # Each batch should NOT report its own loss. As by default training # model should have a "Final" output mode. assert len(loss.size()) == 0 # Run with trained weights. out = inference(input) # Check we are now equal with labels. helpers.assert_allequal(actual=torch.argmax(out.int(), dim=1), expected=label) @pytest.mark.parametrize("use_half", [True, False]) def test_training_inference_parameters(use_half): torch.manual_seed(42) # 10 Batches of 10. 
input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) model = ModelWithLoss(torch.nn.CrossEntropyLoss()) if use_half: model.half() input = input.half() # Run on IPU batch size 1 * 10 popart batches. opts = poptorch.Options().deviceIterations(10) inference = poptorch.inferenceModel(model, opts) training = poptorch.trainingModel(model, options=opts) inference = poptorch.inferenceModel(model) # Run all 10 batches as batchsize 10. out = inference(input) # Sanity check we weren't already matching the label. assert not torch.equal(torch.argmax(out.int(), dim=1), label) for _ in range(0, 1000): _, loss = training(input, label) # Each batch should NOT report its own loss. As by default training model should have a "Final" output mode. assert len(loss.size()) == 0 # This will trigger copyWeightsToHost() for _ in model.named_parameters(): pass # Run with trained weights. out = inference(input) # Check we are now equal with labels. helpers.assert_allequal(actual=torch.argmax(out.int(), dim=1), expected=label) @pytest.mark.parametrize("use_half", [True, False]) def test_access_parameters(use_half): torch.manual_seed(42) # 10 Batches of 10. input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) model = ModelWithLoss(torch.nn.CrossEntropyLoss()) if use_half: model.half() input = input.half() # Run on IPU batch size 1 * 10 popart batches. opts = poptorch.Options().deviceIterations(10) poptorch_model = poptorch.trainingModel(model, options=opts) original_weights = str(model.linear.weight) inference = poptorch.inferenceModel(model) # Run all 10 batches as batchsize 10. out = inference(input) assert original_weights == str(model.linear.weight) # Sanity check we weren't already matching the label. assert not torch.equal(torch.argmax(out.int(), dim=1), label) for _ in range(0, 1000): _, loss = poptorch_model(input, label) # Each batch should NOT report its own loss. 
As by default training model should have a "Final" output mode. assert len(loss.size()) == 0 assert original_weights != str(poptorch_model.model.linear.weight) # Run with trained weights. out = inference(input) # Check we are now equal with labels. helpers.assert_allequal(actual=torch.argmax(out.int(), dim=1), expected=label) class DummyTrainingModel(torch.nn.Module): """ Dummy training model """ def __init__(self): super().__init__() self.conv = torch.nn.Conv2d(16, 4, (3, 3)) self.loss = torch.nn.NLLLoss() self.batch_norm = torch.nn.BatchNorm2d(4) self.softmax = torch.nn.LogSoftmax(dim=1) def forward(self, x, target): x = self.conv(x) x = self.batch_norm(x) x = self.softmax(x) return self.loss(x, target) def test_torch_save(): torch.manual_seed(42) # create a dummy model model = DummyTrainingModel() # create optimizer optimizer = optim.SGD(model.parameters(), lr=0.01) # store the weights before training pre_train_weights = copy.deepcopy(model.state_dict()['conv.weight']) # wrap it in a trainingModel training_model = poptorch.trainingModel(model, optimizer=optimizer) # run on dummy data for one iteration input = torch.randn(5, 16, 10, 10) target = torch.empty(5, 8, 8, dtype=torch.long).random_(0, 4) _ = training_model(input, target) with tempfile.TemporaryDirectory(dir=".") as d: model_file = os.path.join(d, "model.save") # save the model torch.save(model, model_file) # reload the model reloaded_model = torch.load(model_file) # make sure the reloaded weights are the same as the # model and trainingModel assert np.allclose(model.state_dict()['conv.weight'], reloaded_model.state_dict()['conv.weight']) assert np.allclose(model.state_dict()['conv.weight'], training_model.state_dict()['conv.weight']) # make sure we actually trained and we are not just checking # the original wrapped model weights assert not np.allclose(model.state_dict()['conv.weight'], pre_train_weights) @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") 
@pytest.mark.ipuHardwareRequired def test_seed_precompilation(capfd): # create a dummy model model = ModelWithLoss(torch.nn.CrossEntropyLoss(), use_dropout=True) # create optimizer optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01) opts = poptorch.Options().randomSeed(42) opts.useOfflineIpuTarget(poptorch.ipuHardwareVersion()) training_model = poptorch.trainingModel(model, options=opts, optimizer=optimizer) # 10 Batches of 10. input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) training_model.compile(input, label) # Clear the outputs (We only want to parse what's triggered by save() helpers.LogChecker(capfd) with tempfile.TemporaryDirectory() as d: path = os.path.join(d, "checkpoint.pt") training_model.save(path) # Creating a checkpoint should trigger copies of the weights, optimizer state # random seed and rng state but as we're using an offline target nothing # should happen. log = helpers.LogChecker(capfd) log.assert_no_matches("Reading random seed") log.assert_no_matches("Reading RNG state") log.assert_no_matches("Implicit copyWeightsToHost()") log.assert_no_matches( "Writing optimiser state tensors from IPU to host.") poptorch.load(path) log = helpers.LogChecker(capfd) log.assert_matches("Writing weights from host to IPU memory") log.assert_matches("Setting random seed to") # We haven't run on HW so we don't have a state yet log.assert_no_matches("Setting RNG state") log.assert_no_matches( "Writing optimiser state tensors from host to IPU memory") @helpers.printCapfdOnExit @helpers.overridePoptorchLogLevel("DEBUG") @pytest.mark.ipuHardwareRequired def test_save_everything(capfd, caplog): # create a dummy model model = ModelWithLoss(torch.nn.CrossEntropyLoss(), use_dropout=True) # create optimizer optimizer = poptorch.optim.SGD(model.parameters(), lr=0.01) opts = poptorch.Options().randomSeed(42) training_model = poptorch.trainingModel(model, options=opts, optimizer=optimizer) # 10 Batches of 10. 
input = torch.randn(10, 10) # 10 batches of 1 label = torch.randint(0, 10, [1]) label = label.expand([10]) first_out, first_loss = training_model(input, label) # Clear the outputs (We only want to parse what's triggered by save() helpers.LogChecker(capfd) origin_out = [] loaded_out = [] with tempfile.TemporaryDirectory() as d: path = os.path.join(d, "checkpoint.pt") training_model.save(path) # Creating a checkpoint should trigger copies of the weights, optimizer state # random seed and rng state. log = helpers.LogChecker(capfd) log.assert_matches("Reading random seed") log.assert_matches("Reading RNG state") log.assert_matches("Writing optimiser state tensors from IPU to host.") log = helpers.LogChecker(caplog) log.assert_matches("Implicit copyWeightsToHost()") origin_out.append(training_model(input, label)) loaded = poptorch.load(path) log = helpers.LogChecker(capfd) log.assert_matches("Writing weights from host to IPU memory") log.assert_matches("Setting random seed to") log.assert_matches("Setting RNG state") log.assert_matches( "Writing optimiser state tensors from host to IPU memory") loaded_out.append(loaded(input, label)) origin_out.append(training_model(input, label)) # Everything is loaded: there shouldn't be any transfer log = helpers.LogChecker(capfd) log.assert_no_matches("Writing weights from host to IPU memory") log.assert_no_matches("Implicit copyWeightsToHost()") log.assert_no_matches("random seed") log.assert_no_matches("RNG state") log.assert_no_matches("Writing optimiser state tensors from") loaded.detachFromDevice() log = helpers.LogChecker(capfd) log.assert_matches("Writing weights from IPU to host") log.assert_matches("Writing optimiser state tensors from IPU to host") log.assert_matches("Reading random seed") log.assert_matches("Reading RNG state") log.assert_matches("Detached from device") loaded_out.append(loaded(input, label)) log = helpers.LogChecker(capfd) log.assert_matches("Writing weights from host to IPU memory") log.assert_matches( 
"Writing optimiser state tensors from host to IPU memory") log.assert_matches("Setting random seed to") log.assert_matches("Setting RNG state") for (out, loss), (load_out, load_loss) in zip(origin_out, loaded_out): helpers.assert_allclose(expected=out, actual=load_out) assert loss == load_loss assert not torch.allclose(out, first_out, rtol=1e-02, atol=1e-02) assert loss != first_loss def train_and_check_weight_sharing_ipu_cpu(model, training_model, input, target, original_parameters): # Make sure the first run doesn't already pass the test. original, _ = training_model(input, target) assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02) # Train on IPU. for _ in range(0, 1000): out, _ = training_model(input, target) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed to train the model" # Run without copying the weights and check they've been automatically updated. native_out = model(input) helpers.assert_allclose(expected=native_out, actual=out) assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" training_model.deviceToHostCounter = 0 # reset counter current_parameters = str(list(model.parameters())) assert original_parameters != current_parameters assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed to access the parameters after inference" last_parameters = current_parameters native_out = model(input) helpers.assert_allclose(expected=native_out, actual=out) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed after inference" current_parameters = str(list(model.parameters())) assert last_parameters == current_parameters assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed to access the parameters after inference" def test_weights_sharing_ipu_cpu(): torch.manual_seed(42) model = ModelWithLoss(torch.nn.MSELoss()) training_model = poptorch.trainingModel(model) training_model.deviceToHostCounter = 0 realMethod = 
training_model.copyWeightsToHost original_parameters = str(list(model.parameters())) def deviceToHostWrapper(model): model.deviceToHostCounter += 1 realMethod() training_model.copyWeightsToHost = types.MethodType( deviceToHostWrapper, training_model) # Same model as above, they will share weights (in 'model') which once training is finished can be copied back. target = torch.randn(10) input = torch.randn(10) train_and_check_weight_sharing_ipu_cpu(model, training_model, input, target, original_parameters) # Train on IPU. for _ in range(0, 50): out, _ = training_model(input, target) current_parameters = str(list(model.parameters())) assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" assert original_parameters != current_parameters training_model.deviceToHostCounter = 0 # reset counter for _ in range(0, 50): out, _ = training_model(input, target) # Access a parameter directly: print(model.linear.weight.data) assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" training_model.deviceToHostCounter = 0 # reset counter for _ in range(0, 50): out, _ = training_model(input, target) # Check state_dict works: torch.save(model.state_dict(), "/tmp/model.save") model.state_dict() assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" training_model.deviceToHostCounter = 0 # reset counter for _ in range(0, 50): out, _ = training_model(input, target) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed to train the model" # Run without copying the weights and check they've been automatically updated. 
native_out = model(input) helpers.assert_allclose(expected=native_out, actual=out) assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" training_model.deviceToHostCounter = 0 # reset counter native_out = model(input) helpers.assert_allclose(expected=native_out, actual=out) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed after inference" # Check we have trained the "model" helpers.assert_allclose(expected=native_out, actual=target, rtol=1e-02, atol=1e-02) def train_N_times_and_check_copying(N, inference_model, training_model, input, target): # Train on IPU. for _ in range(0, N): out, _ = training_model(input, target) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed to train the model" # Run without copying the weights and check they've been automatically updated. out_inference = inference_model(input) helpers.assert_allclose(expected=out, actual=out_inference) assert training_model.deviceToHostCounter == 1, \ "1 implicit copy after having trained the model" training_model.deviceToHostCounter = 0 # reset counter out_inference = inference_model(input) helpers.assert_allclose(expected=out, actual=out_inference) assert training_model.deviceToHostCounter == 0, \ "No implicit copy needed after inference" return out_inference def test_weights_sharing_ipus(): torch.manual_seed(42) model = ModelWithLoss(torch.nn.MSELoss()) training_model = poptorch.trainingModel(model) training_model.deviceToHostCounter = 0 realMethod = training_model.copyWeightsToHost def deviceToHostWrapper(model): model.deviceToHostCounter += 1 realMethod() training_model.copyWeightsToHost = types.MethodType( deviceToHostWrapper, training_model) # Same model as above, they will share weights (in 'model') which once training is finished can be copied back. 
inference_model = poptorch.inferenceModel(model) target = torch.randn(10) input = torch.randn(10) out_inference = inference_model(input) assert not torch.allclose(out_inference, target, rtol=1e-02, atol=1e-02) # Make sure the first run doesn't already pass the test. original, _ = training_model(input, target) assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02) train_N_times_and_check_copying(1000, inference_model, training_model, input, target) out_inference = train_N_times_and_check_copying(1500, inference_model, training_model, input, target) helpers.assert_allclose(actual=out_inference, expected=target, rtol=1e-02, atol=1e-02) def test_implicit_first_time_copy(): torch.manual_seed(42) # Train on host. model = torch.nn.Linear(10, 10) target = torch.randn(10) input = torch.randn(10) loss_function = torch.nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=0.01) model.eval() # Make sure the first run doesn't already pass the test. native = model(input) assert not torch.allclose(native, target, rtol=1e-02, atol=1e-02) model.train() for _ in range(0, 2500): optimizer.zero_grad() # Run model. outputs = model(input) # Back prop loss. loss = loss_function(target, outputs) loss.backward() optimizer.step() # Check the model is now trained model.eval() native = model(input) helpers.assert_allclose(actual=native, expected=target, rtol=1e-02, atol=1e-02) # Run on IPU. ipuModel = poptorch.inferenceModel(model) poptorch_out = ipuModel(input) # Check IPU returns same value as native without the weights explicitly being copied. helpers.assert_allclose(expected=native, actual=poptorch_out) helpers.assert_allclose(actual=poptorch_out, expected=target, rtol=1e-02, atol=1e-02) def test_implicit_first_time_copy_negative(): torch.manual_seed(42) # Train on host. 
model = torch.nn.Linear(10, 10) target = torch.randn(10) input = torch.randn(10) loss_function = torch.nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=0.01) model.eval() # Make sure the first run doesn't already pass the test. native = model(input) assert not torch.allclose(native, target, rtol=1e-02, atol=1e-02) # Run on IPU. poptorch_model = poptorch.inferenceModel(model) poptorch_out = poptorch_model(input) # Weights should be copied so check we are matching host but NOT the target. helpers.assert_allclose(expected=native, actual=poptorch_out) assert not torch.allclose(native, target, rtol=1e-02, atol=1e-02) model.train() for _ in range(0, 2500): optimizer.zero_grad() # Run model. outputs = model(input) # Back prop loss. loss = loss_function(target, outputs) loss.backward() optimizer.step() # Check the model is now trained model.eval() native = model(input) helpers.assert_allclose(actual=native, expected=target, rtol=1e-02, atol=1e-02) # Without recompilation or copying the weights check we are matching neither host nor the target. poptorch_out = poptorch_model(input) # Check IPU *does not* return the same value as native assert not torch.allclose(poptorch_out, native) assert not torch.allclose(poptorch_out, target, rtol=1e-02, atol=1e-02) def test_weight_overwrite_trained_weight(): torch.manual_seed(42) model = ModelWithLoss(torch.nn.MSELoss()) poptorch_model = poptorch.trainingModel(model) target = torch.randn(10) input = torch.randn(10) # Make sure the first run doesn't already pass the test. original, loss = poptorch_model(input, target) assert not torch.allclose(original, target, rtol=1e-02, atol=1e-02) # Train on IPU. for _ in range(0, 2500): trained_out, trained_loss = poptorch_model(input, target) # Check we have trained the "model" helpers.assert_allclose(actual=trained_out, expected=target, rtol=1e-02, atol=1e-02) # Overwrite the trained weights with weights from host. poptorch_model.copyWeightsToDevice() # Don't train them. 
poptorch_model.setOptimizer(optim.SGD(model.parameters(), lr=0.0)) out, loss = poptorch_model(input, target) host_out = model(input) # Check we are no longer trained. assert not torch.allclose(out, target, rtol=1e-02, atol=1e-02) assert not torch.allclose(loss, trained_loss) helpers.assert_allclose(expected=host_out, actual=out) @pytest.mark.parametrize("use_half", [True, False]) def test_access_scalar_parameter(use_half): class ExampleModel(torch.nn.Module): def __init__(self): super().__init__() self.bias = torch.nn.Parameter(torch.zeros(())) def forward(self, x): x = x + 1 # It is important to make sure the result of the print is used. x = poptorch.ipu_print_tensor(x) return x + self.bias def custom_loss(output, target): # Mean squared error with a scale loss = output - target loss = loss * loss * 5 return poptorch.identity_loss(loss, reduction="mean") class ExampleModelWithCustomLoss(torch.nn.Module): def __init__(self): super().__init__() self.model = ExampleModel() def forward(self, input, target=None): out = self.model(input) if target is not None: return out, custom_loss(out, target) return out model = ExampleModelWithCustomLoss() input = torch.tensor([1.0, 2.0, 3.0]) target = torch.tensor([30.0, 40.0, 50.0]) if use_half: model.half() input = input.half() target = target.half() poptorch_model = poptorch.trainingModel(model) original_bias = str(poptorch_model.model.model.bias) for _ in range(10): poptorch_model(input=input, target=target) updated_bias = str(poptorch_model.model.model.bias) assert original_bias != updated_bias poptorch_model.copyWeightsToHost() # Bias should already be up to date assert updated_bias == str(poptorch_model.model.model.bias) @pytest.mark.parametrize("reverse_equal_call", [True, False]) def test_copy_on_torch_equal(reverse_equal_call): torch.manual_seed(42) model = ModelWithLoss(torch.nn.MSELoss()) poptorch_model = poptorch.trainingModel(model, optimizer=torch.optim.SGD( model.parameters(), lr=0.01)) target = torch.ones(10) input 
= torch.randn(10) weight_at_start = model.linear.weight.clone().data for _ in range(100): poptorch_model(input, target) if reverse_equal_call: assert not torch.equal(model.linear.weight, weight_at_start) else: assert not torch.equal(weight_at_start, model.linear.weight) def test_copy_after_compile(): torch.manual_seed(42) model = ModelWithLoss(torch.nn.MSELoss()) poptorch_model = poptorch.trainingModel(model, optimizer=torch.optim.SGD( model.parameters(), lr=0.01)) target = torch.ones(10) input = torch.randn(10) poptorch_model.compile(input, target) # If we haven't copied the weights, Popart will fire an exception # when trying to execute the model. poptorch_model(input, target) def test_torch_save_unwrapped(): torch.manual_seed(42) class Model(torch.nn.Module): def __init__(self): super().__init__() self.conv = torch.nn.Conv2d(2, 2, 1, padding=0) self.register_buffer("test_buffer", torch.zeros([2], dtype=torch.float32)) self.register_parameter("test_param", torch.nn.Parameter(torch.empty(10))) self.loss = torch.nn.L1Loss() def forward(self, inp): out = self.conv(inp) loss = self.loss(out) return out, loss model = Model() # Only training models instrument the model so we can't use poptporch.inferenceModel poptorch.trainingModel(model) # An inference model sharing its user model with a training model will be instrumented though. poptorch.inferenceModel(model) with tempfile.TemporaryDirectory() as tmp: torch_file = os.path.join(tmp, "torch_saved.pt") torch.save(model.state_dict(), torch_file) # Ensure the state dictionaries returned by the training and inference models don't contain any PopTorch wrapper. 
with unittest.mock.patch.object( poptorch._impl, # pylint: disable=protected-access "_restoreWrapperIfNecessary", wraps=poptorch._impl._restoreWrapperIfNecessary # pylint: disable=protected-access ) as restore_fn: torch.load(torch_file) restore_fn.assert_not_called() ================================================ FILE: version.json ================================================ {"major": "3", "minor": "4", "point": "0"}