Repository: Yangqing/caffe2 Branch: master Commit: 28523ff1ff33 Files: 1931 Total size: 17.8 MB Directory structure: gitextract_2u1ieaci/ ├── .gitattributes ├── .github/ │ ├── CONTRIBUTING.md │ ├── ISSUE_TEMPLATE.md │ └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .gitmodules ├── .jenkins/ │ ├── README.md │ ├── build.sh │ └── test.sh ├── .travis/ │ ├── build.sh │ ├── install.sh │ ├── setup.sh │ └── test.sh ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── VERSION_NUMBER ├── appveyor.yml ├── binaries/ │ ├── CMakeLists.txt │ ├── caffe2_benchmark.cc │ ├── convert_caffe_image_db.cc │ ├── convert_db.cc │ ├── convert_encoded_to_raw_leveldb.cc │ ├── core_overhead_benchmark.cc │ ├── db_throughput.cc │ ├── inspect_gpus.cc │ ├── make_cifar_db.cc │ ├── make_image_db.cc │ ├── make_mnist_db.cc │ ├── predictor_verifier.cc │ ├── print_core_object_sizes.cc │ ├── print_registered_core_operators.cc │ ├── run_plan.cc │ ├── run_plan_mpi.cc │ ├── speed_benchmark.cc │ ├── split_db.cc │ ├── tutorial_blob.cc │ └── zmq_feeder.cc ├── caffe/ │ ├── __init__.py │ └── proto/ │ ├── CMakeLists.txt │ ├── __init__.py │ └── caffe.proto ├── caffe2/ │ ├── .clang-format │ ├── CMakeLists.txt │ ├── __init__.py │ ├── contrib/ │ │ ├── CMakeLists.txt │ │ ├── __init__.py │ │ ├── aten/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── aten_op.cc │ │ │ ├── aten_op.h │ │ │ ├── aten_op_cuda.cc │ │ │ ├── aten_op_template.h │ │ │ ├── aten_test.py │ │ │ ├── docs/ │ │ │ │ ├── pytorch_to_caffe2.md │ │ │ │ └── sample.py │ │ │ └── gen_op.py │ │ ├── cuda-convnet2/ │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── build.sh │ │ │ ├── convdata.py │ │ │ ├── convnet.py │ │ │ ├── cudaconv3/ │ │ │ │ ├── Makefile │ │ │ │ ├── include/ │ │ │ │ │ ├── conv_util.cuh │ │ │ │ │ └── cudaconv2.cuh │ │ │ │ └── src/ │ │ │ │ ├── conv_util.cu │ │ │ │ ├── filter_acts.cu │ │ │ │ ├── img_acts.cu │ │ │ │ └── weight_acts.cu │ │ │ ├── cudaconvnet/ │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── include/ │ │ │ │ │ ├── actbroadcaster.cuh │ │ │ │ │ ├── convnet.cuh │ │ │ │ │ ├── copypipeline.cuh │ │ │ │ │ ├── cost.cuh │ │ │ │ │ ├── data.cuh │ │ │ │ │ ├── gradreducer.cuh │ │ │ │ │ ├── jpeg.h │ │ │ │ │ ├── layer.cuh │ │ │ │ │ ├── layer_kernels.cuh │ │ │ │ │ ├── lr.cuh │ │ │ │ │ ├── memorysource.cuh │ │ │ │ │ ├── messages.cuh │ │ │ │ │ ├── neuron.cuh │ │ │ │ │ ├── pipedispenser.cuh │ │ │ │ │ ├── pyconvnet.cuh │ │ │ │ │ ├── reducepipeline.cuh │ │ │ │ │ ├── streambroadcast.cuh │ │ │ │ │ ├── timer.cuh │ │ │ │ │ ├── util.cuh │ │ │ │ │ ├── weights.cuh │ │ │ │ │ └── worker.cuh │ │ │ │ └── src/ │ │ │ │ ├── actbroadcaster.cu │ │ │ │ ├── convnet.cu │ │ │ │ ├── copypipeline.cu │ │ │ │ ├── cost.cu │ │ │ │ ├── data.cu │ │ │ │ ├── gradreducer.cu │ │ │ │ ├── jpeg.cpp │ │ │ │ ├── layer.cu │ │ │ │ ├── layer_kernels.cu │ │ │ │ ├── lr.cu │ │ │ │ ├── memorysource.cu │ │ │ │ ├── neuron.cu │ │ │ │ ├── pyconvnet.cu │ │ │ │ ├── reducepipeline.cu │ │ │ │ ├── streambroadcast.cu │ │ │ │ ├── util.cu │ │ │ │ ├── weights.cu │ │ │ │ └── worker.cu │ │ │ ├── initw.py │ │ │ ├── layer.py │ │ │ ├── layers/ │ │ │ │ ├── layer-params-cifar10-11pct.cfg │ │ │ │ ├── layer-params-imagenet-1gpu.cfg │ │ │ │ ├── layer-params-imagenet-2gpu-data.cfg │ │ │ │ ├── layer-params-imagenet-2gpu-model.cfg │ │ │ │ ├── layer-params-imagenet-4gpu-data-model.cfg │ │ │ │ ├── layer-params-imagenet-4gpu-data.cfg │ │ │ │ ├── layers-cifar10-11pct.cfg │ │ │ │ ├── layers-imagenet-1gpu.cfg │ │ │ │ ├── layers-imagenet-2gpu-data.cfg │ │ │ │ ├── layers-imagenet-2gpu-model.cfg │ │ │ │ ├── 
layers-imagenet-4gpu-data-model.cfg │ │ │ │ └── layers-imagenet-4gpu-data.cfg │ │ │ ├── make-data/ │ │ │ │ ├── input_meta │ │ │ │ ├── make-data.py │ │ │ │ └── pyext/ │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── include/ │ │ │ │ │ └── pyext.h │ │ │ │ └── src/ │ │ │ │ └── pyext.cpp │ │ │ ├── nvmatrix/ │ │ │ │ ├── Makefile │ │ │ │ ├── include/ │ │ │ │ │ ├── memory.cuh │ │ │ │ │ ├── nvmatrix.cuh │ │ │ │ │ ├── nvmatrix_kernels.cuh │ │ │ │ │ └── nvmatrix_operators.cuh │ │ │ │ └── src/ │ │ │ │ ├── memory.cu │ │ │ │ ├── nvmatrix.cu │ │ │ │ └── nvmatrix_kernels.cu │ │ │ ├── python_util/ │ │ │ │ ├── __init__.py │ │ │ │ ├── data.py │ │ │ │ ├── gpumodel.py │ │ │ │ ├── options.py │ │ │ │ └── util.py │ │ │ ├── shownet.py │ │ │ └── util/ │ │ │ ├── Makefile │ │ │ ├── include/ │ │ │ │ ├── matrix.h │ │ │ │ ├── matrix_funcs.h │ │ │ │ ├── queue.h │ │ │ │ ├── sync.h │ │ │ │ └── thread.h │ │ │ └── src/ │ │ │ └── matrix.cpp │ │ ├── docker-ubuntu-14.04/ │ │ │ └── Dockerfile │ │ ├── gloo/ │ │ │ ├── CMakeLists.txt │ │ │ ├── allgather_ops.cc │ │ │ ├── allgather_ops.h │ │ │ ├── allreduce_ops.cc │ │ │ ├── allreduce_ops.h │ │ │ ├── allreduce_ops_gpu.cc │ │ │ ├── barrier_ops.cc │ │ │ ├── barrier_ops.h │ │ │ ├── broadcast_ops.cc │ │ │ ├── broadcast_ops.h │ │ │ ├── broadcast_ops_gpu.cc │ │ │ ├── common.cc │ │ │ ├── common.h │ │ │ ├── common_world_ops.cc │ │ │ ├── common_world_ops.h │ │ │ ├── common_world_ops_gpu.cc │ │ │ ├── context.cc │ │ │ ├── context.h │ │ │ ├── gloo_test.py │ │ │ ├── py_export.cc │ │ │ ├── reduce_scatter_ops.cc │ │ │ ├── reduce_scatter_ops.h │ │ │ ├── store_handler.cc │ │ │ └── store_handler.h │ │ ├── nccl/ │ │ │ ├── CMakeLists.txt │ │ │ ├── cuda_nccl_gpu.cc │ │ │ ├── cuda_nccl_gpu.h │ │ │ ├── cuda_nccl_op_gpu.cc │ │ │ └── nccl_ops_test.py │ │ ├── nervana/ │ │ │ ├── CMakeLists.txt │ │ │ ├── nervana.h │ │ │ ├── nervana_c_api.cu │ │ │ ├── nervana_c_api.h │ │ │ ├── nervana_fc_op_gpu.cc │ │ │ ├── nervana_fc_op_gpu_test.cc │ │ │ ├── nervana_init_gpu.cc │ │ │ └── nervana_math_gpu.cc │ │ ├── nnpack/ │ │ │ ├── nnpack_ops.cc │ │ │ └── nnpack_ops_test.py │ │ ├── playground/ │ │ │ ├── AnyExp.py │ │ │ ├── AnyExpOnTerm.py │ │ │ ├── ModuleRegister.py │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── checkpoint.py │ │ │ ├── compute_loss.py │ │ │ ├── compute_topk_accuracy.py │ │ │ ├── meter.py │ │ │ ├── module_map.py │ │ │ ├── output_generator.py │ │ │ └── resnet50demo/ │ │ │ ├── IN1k_resnet50.py │ │ │ ├── __init__.py │ │ │ ├── caffe2_resnet50_default_forward.py │ │ │ ├── caffe2_resnet50_default_param_update.py │ │ │ ├── explicit_resnet_forward.py │ │ │ ├── explicit_resnet_param_update.py │ │ │ ├── gfs_IN1k.py │ │ │ └── rendezvous_filestore.py │ │ ├── prof/ │ │ │ ├── CMakeLists.txt │ │ │ ├── cuda_profile_ops.cc │ │ │ ├── cuda_profile_ops_test.py │ │ │ ├── htrace_async_dag_net_gpu.cc │ │ │ ├── htrace_conf.cc │ │ │ ├── htrace_conf.h │ │ │ ├── htrace_dag_net.cc │ │ │ ├── htrace_to_chrome.py │ │ │ ├── prof_dag_net.cc │ │ │ ├── prof_dag_net.h │ │ │ ├── prof_dag_stats_op.cc │ │ │ └── prof_dag_stats_op.h │ │ ├── script/ │ │ │ ├── CMakeLists.txt │ │ │ ├── caffe2_script_test.py │ │ │ ├── compiler.cc │ │ │ ├── compiler.h │ │ │ ├── error_report.h │ │ │ ├── examples/ │ │ │ │ ├── example_beam_search.c2s │ │ │ │ ├── example_post_eos_penalty.c2s │ │ │ │ └── run_examples.py │ │ │ ├── lexer.cc │ │ │ ├── lexer.h │ │ │ ├── parser.h │ │ │ ├── tree.h │ │ │ └── tree_views.h │ │ ├── shm_mutex/ │ │ │ ├── CMakeLists.txt │ │ │ ├── shm_mutex.cc │ │ │ └── shm_mutex.h │ │ ├── tensorboard/ │ │ │ ├── __init__.py │ │ │ ├── 
tensorboard.md │ │ │ ├── tensorboard.py │ │ │ ├── tensorboard_exporter.py │ │ │ ├── tensorboard_exporter_test.py │ │ │ └── tensorboard_test.py │ │ ├── torch/ │ │ │ ├── th_ops.cc │ │ │ ├── th_ops_gpu.cu │ │ │ ├── th_ops_test.py │ │ │ ├── torch_op.cpp │ │ │ ├── torch_op.h │ │ │ ├── torch_op_gpu.cpp │ │ │ └── torch_ops_test.py │ │ └── warpctc/ │ │ ├── ctc_op.cpp │ │ ├── ctc_op.h │ │ ├── ctc_op_gpu.cpp │ │ └── ctc_ops_test.py │ ├── core/ │ │ ├── CMakeLists.txt │ │ ├── allocator.cc │ │ ├── allocator.h │ │ ├── asan.h │ │ ├── blob.h │ │ ├── blob_gpu_test.cc │ │ ├── blob_serialization.cc │ │ ├── blob_serialization.h │ │ ├── blob_serialization_gpu.cc │ │ ├── blob_serializer_base.h │ │ ├── blob_stats.cc │ │ ├── blob_stats.h │ │ ├── blob_test.cc │ │ ├── common.cc │ │ ├── common.h │ │ ├── common_cudnn.cc │ │ ├── common_cudnn.h │ │ ├── common_gpu.cc │ │ ├── common_gpu.h │ │ ├── common_omp.h │ │ ├── common_test.cc │ │ ├── context.cc │ │ ├── context.h │ │ ├── context_gpu.cu │ │ ├── context_gpu.h │ │ ├── context_gpu_test.cc │ │ ├── context_test.cc │ │ ├── cudnn_wrappers.h │ │ ├── db.cc │ │ ├── db.h │ │ ├── event.cc │ │ ├── event.h │ │ ├── event_cpu.h │ │ ├── event_gpu.cc │ │ ├── event_gpu_test.cc │ │ ├── event_test.cc │ │ ├── flags.cc │ │ ├── flags.h │ │ ├── graph.cc │ │ ├── graph.h │ │ ├── graph_test.cc │ │ ├── init.cc │ │ ├── init.h │ │ ├── init_intrinsics_check.cc │ │ ├── init_omp.cc │ │ ├── init_test.cc │ │ ├── logging.cc │ │ ├── logging.h │ │ ├── logging_is_google_glog.h │ │ ├── logging_is_not_google_glog.h │ │ ├── logging_test.cc │ │ ├── macros.h │ │ ├── macros.h.in │ │ ├── memonger.cc │ │ ├── memonger.h │ │ ├── module.cc │ │ ├── module.h │ │ ├── module_test.cc │ │ ├── net.cc │ │ ├── net.h │ │ ├── net_async_base.cc │ │ ├── net_async_base.h │ │ ├── net_async_dag_gpu.cc │ │ ├── net_async_dag_gpu.h │ │ ├── net_async_gpu_thread_pool.h │ │ ├── net_async_gpu_thread_pool_gpu.cc │ │ ├── net_async_polling.cc │ │ ├── net_async_polling.h │ │ ├── net_async_scheduling.cc │ │ ├── net_async_scheduling.h │ │ ├── net_dag.cc │ │ ├── net_dag.h │ │ ├── net_dag_utils.cc │ │ ├── net_dag_utils.h │ │ ├── net_gpu_test.cc │ │ ├── net_simple.cc │ │ ├── net_simple.h │ │ ├── net_simple_async.cc │ │ ├── net_simple_async.h │ │ ├── net_singlethread_async_gpu.cc │ │ ├── net_test.cc │ │ ├── numa.cc │ │ ├── numa.h │ │ ├── observer.h │ │ ├── observer_test.cc │ │ ├── operator.cc │ │ ├── operator.h │ │ ├── operator_gpu_test.cc │ │ ├── operator_gradient.h │ │ ├── operator_schema.cc │ │ ├── operator_schema.h │ │ ├── operator_schema_test.cc │ │ ├── operator_test.cc │ │ ├── parallel_net_test.cc │ │ ├── plan_executor.cc │ │ ├── plan_executor.h │ │ ├── predictor.cc │ │ ├── predictor.h │ │ ├── predictor_test.cc │ │ ├── qtensor.cc │ │ ├── qtensor.h │ │ ├── qtensor_serialization.cc │ │ ├── qtensor_serialization.h │ │ ├── registry.h │ │ ├── registry_test.cc │ │ ├── scope_guard.h │ │ ├── static_tracepoint.h │ │ ├── static_tracepoint_elfx86.h │ │ ├── stats.cc │ │ ├── stats.h │ │ ├── stats_test.cc │ │ ├── tensor.cc │ │ ├── tensor.h │ │ ├── timer.h │ │ ├── timer_test.cc │ │ ├── transform.cc │ │ ├── transform.h │ │ ├── transform_test.cc │ │ ├── typeid.cc │ │ ├── typeid.h │ │ ├── typeid_test.cc │ │ ├── types.cc │ │ ├── types.h │ │ ├── workspace.cc │ │ ├── workspace.h │ │ └── workspace_test.cc │ ├── cuda_rtc/ │ │ ├── CMakeLists.txt │ │ ├── common_rtc.h │ │ ├── elemenntwise_rtc_gpu.cc │ │ └── pool_op_rtc_gpu.cc │ ├── db/ │ │ ├── CMakeLists.txt │ │ ├── create_db_op.cc │ │ ├── create_db_op.h │ │ ├── create_db_op_gpu.cc │ │ ├── db_test.cc │ │ ├── leveldb.cc 
│ │ ├── lmdb.cc │ │ ├── protodb.cc │ │ └── zmqdb.cc │ ├── distributed/ │ │ ├── CMakeLists.txt │ │ ├── file_store_handler.cc │ │ ├── file_store_handler.h │ │ ├── file_store_handler_op.cc │ │ ├── file_store_handler_op.h │ │ ├── file_store_handler_op_gpu.cc │ │ ├── file_store_handler_op_test.py │ │ ├── py_export.cc │ │ ├── redis_store_handler.cc │ │ ├── redis_store_handler.h │ │ ├── redis_store_handler_op.cc │ │ ├── redis_store_handler_op.h │ │ ├── redis_store_handler_op_gpu.cc │ │ ├── redis_store_handler_op_test.py │ │ ├── store_handler.cc │ │ ├── store_handler.h │ │ ├── store_ops.cc │ │ ├── store_ops.h │ │ └── store_ops_test_util.py │ ├── experiments/ │ │ ├── operators/ │ │ │ ├── fully_connected_op_decomposition.cc │ │ │ ├── fully_connected_op_decomposition.h │ │ │ ├── fully_connected_op_decomposition_gpu.cc │ │ │ ├── fully_connected_op_prune.cc │ │ │ ├── fully_connected_op_prune.h │ │ │ ├── fully_connected_op_sparse.cc │ │ │ ├── fully_connected_op_sparse.h │ │ │ ├── funhash_op.cc │ │ │ ├── funhash_op.h │ │ │ ├── sparse_funhash_op.cc │ │ │ ├── sparse_funhash_op.h │ │ │ ├── sparse_matrix_reshape_op.cc │ │ │ ├── sparse_matrix_reshape_op.h │ │ │ ├── tt_contraction_op.cc │ │ │ ├── tt_contraction_op.h │ │ │ ├── tt_contraction_op_gpu.cc │ │ │ ├── tt_pad_op.cc │ │ │ └── tt_pad_op.h │ │ └── python/ │ │ ├── SparseTransformer.py │ │ ├── convnet_benchmarks.py │ │ ├── device_reduce_sum_bench.py │ │ ├── funhash_op_test.py │ │ ├── net_construct_bench.py │ │ ├── sparse_funhash_op_test.py │ │ ├── sparse_reshape_op_test.py │ │ ├── tt_contraction_op_test.py │ │ └── tt_pad_op_test.py │ ├── image/ │ │ ├── CMakeLists.txt │ │ ├── image_input_op.cc │ │ ├── image_input_op.h │ │ ├── image_input_op_gpu.cc │ │ ├── transform_gpu.cu │ │ └── transform_gpu.h │ ├── mkl/ │ │ ├── CMakeLists.txt │ │ ├── mkl_operator.cc │ │ ├── mkl_utils.h │ │ ├── mkl_utils_test.cc │ │ ├── mklmemory_serialization.cc │ │ ├── mklmemory_serialization_test.cc │ │ ├── operators/ │ │ │ ├── concat_op.cc │ │ │ ├── conv_op.cc │ │ │ ├── conv_op_mkldnn.cc │ │ │ ├── elementwise_sum_op.cc │ │ │ ├── fully_connected_op.cc │ │ │ ├── local_response_normalization_op.cc │ │ │ ├── operator_fallback_mkl.cc │ │ │ ├── operator_fallback_mkl.h │ │ │ ├── packed_fc_op.cc │ │ │ ├── pool_op.cc │ │ │ ├── relu_op.cc │ │ │ ├── spatial_batch_norm_op.cc │ │ │ ├── squeeze_op.cc │ │ │ └── utility_ops.cc │ │ └── utils/ │ │ ├── mkl_context.cc │ │ ├── mkl_context.h │ │ ├── mkl_dnn_cppwrapper.h │ │ ├── mkl_memory.cc │ │ ├── mkl_memory.h │ │ ├── mkl_operator.h │ │ ├── mkl_version_check.h │ │ └── sgemm_pack.h │ ├── mobile/ │ │ ├── CMakeLists.txt │ │ └── contrib/ │ │ ├── CMakeLists.txt │ │ ├── arm-compute/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── core/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── context.cc │ │ │ │ ├── context.h │ │ │ │ ├── net_gl.cc │ │ │ │ ├── net_gl.h │ │ │ │ ├── operator.cc │ │ │ │ ├── operator.h │ │ │ │ ├── rewrite_net.cc │ │ │ │ └── rewrite_net.h │ │ │ ├── models/ │ │ │ │ ├── squeezenet_init.pb │ │ │ │ └── squeezenet_predict.pb │ │ │ ├── operators/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── activation_ops.cc │ │ │ │ ├── activation_ops.h │ │ │ │ ├── concat_op.cc │ │ │ │ ├── conv_op.cc │ │ │ │ ├── copy_op.cc │ │ │ │ ├── elementwise_sum_op.cc │ │ │ │ ├── fully_connected_op.cc │ │ │ │ ├── norm_planar_yuv_op.cc │ │ │ │ ├── pool_op.cc │ │ │ │ ├── reshape_op.cc │ │ │ │ ├── resize_op.cc │ │ │ │ ├── softmax_op.cc │ │ │ │ └── spatial_batch_norm_op.cc │ │ │ ├── run_tests.sh │ │ │ └── test/ │ │ │ ├── CMakeLists.txt │ │ │ ├── gl_activation_ops_test.cc │ │ │ ├── 
gl_alignment_test.cc │ │ │ ├── gl_concat_op_test.cc │ │ │ ├── gl_context_test.cc │ │ │ ├── gl_conv_op_test.cc │ │ │ ├── gl_copy_op_test.cc │ │ │ ├── gl_elementwise_sum_op_test.cc │ │ │ ├── gl_fully_connected_op_test.cc │ │ │ ├── gl_model_test.cc │ │ │ ├── gl_model_test.h │ │ │ ├── gl_norm_planar_yuv_op_test.cc │ │ │ ├── gl_operator_test.h │ │ │ ├── gl_pool_op_test.cc │ │ │ ├── gl_resize_op_test.cc │ │ │ ├── gl_softmax_op_test.cc │ │ │ └── gl_spatial_batch_norm_op_test.cc │ │ ├── ios/ │ │ │ ├── CMakeLists.txt │ │ │ ├── ios_caffe.cc │ │ │ ├── ios_caffe.h │ │ │ ├── ios_caffe_defines.h │ │ │ ├── ios_caffe_predictor.cc │ │ │ ├── ios_caffe_predictor.h │ │ │ ├── mpscnn/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── MPSCNN.metal │ │ │ │ ├── mpscnn.h │ │ │ │ ├── mpscnn.mm │ │ │ │ ├── mpscnn_context.h │ │ │ │ ├── mpscnn_context.mm │ │ │ │ ├── mpscnn_graph.mm │ │ │ │ ├── mpscnn_graph_mask.h │ │ │ │ ├── mpscnn_graph_mask.mm │ │ │ │ ├── mpscnn_kernels.h │ │ │ │ ├── mpscnn_test.h │ │ │ │ └── mpscnn_test.mm │ │ │ ├── pool_test.cc │ │ │ └── resize_test.cc │ │ ├── libopencl-stub/ │ │ │ ├── Android.mk │ │ │ ├── LICENSE │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── include/ │ │ │ │ ├── CL/ │ │ │ │ │ ├── cl.h │ │ │ │ │ ├── cl.hpp │ │ │ │ │ ├── cl_ext.h │ │ │ │ │ ├── cl_gl.h │ │ │ │ │ ├── cl_gl_ext.h │ │ │ │ │ ├── cl_platform.h │ │ │ │ │ └── opencl.h │ │ │ │ └── libopencl.h │ │ │ └── src/ │ │ │ └── libopencl.c │ │ ├── libvulkan-stub/ │ │ │ ├── include/ │ │ │ │ ├── libvulkan-stub.h │ │ │ │ └── vulkan/ │ │ │ │ ├── vk_platform.h │ │ │ │ └── vulkan.h │ │ │ └── src/ │ │ │ └── libvulkan-stub.c │ │ ├── nnapi/ │ │ │ ├── CMakeLists.txt │ │ │ ├── NeuralNetworks.h │ │ │ ├── dlnnapi.c │ │ │ ├── dlnnapi.h │ │ │ ├── nnapi.cc │ │ │ ├── nnapi.h │ │ │ ├── nnapi_benchmark.cc │ │ │ └── nnapi_test.cc │ │ ├── opengl/ │ │ │ ├── CMakeLists.txt │ │ │ ├── android/ │ │ │ │ ├── AndroidGLContext.cc │ │ │ │ ├── AndroidGLContext.h │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GLContext.cc │ │ │ │ ├── GLImageAllocator.cc │ │ │ │ ├── arm_neon_support.h │ │ │ │ ├── gl3stub.c │ │ │ │ └── gl3stub.h │ │ │ ├── core/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── DataTransfer.cc │ │ │ │ ├── DataTransfer.h │ │ │ │ ├── GL.h │ │ │ │ ├── GLContext.cc │ │ │ │ ├── GLContext.h │ │ │ │ ├── GLFilter.cc │ │ │ │ ├── GLFilter.h │ │ │ │ ├── GLImage.cc │ │ │ │ ├── GLImage.h │ │ │ │ ├── GLImageAllocator.cc │ │ │ │ ├── GLImageAllocator.h │ │ │ │ ├── GLLogging.h │ │ │ │ ├── GLPBO.cc │ │ │ │ ├── GLPBO.h │ │ │ │ ├── GLPlainTexture.cc │ │ │ │ ├── GLPlainTexture.h │ │ │ │ ├── GLPredictor.cc │ │ │ │ ├── GLPredictor.h │ │ │ │ ├── GLTexture.cc │ │ │ │ ├── GLTexture.h │ │ │ │ ├── ImageAllocator.h │ │ │ │ ├── arm_neon_support.h │ │ │ │ ├── rewrite_net.cc │ │ │ │ └── rewrite_net.h │ │ │ ├── ios/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GLContext.cc │ │ │ │ ├── GLImageAllocator.cc │ │ │ │ ├── IOSGLContext.h │ │ │ │ ├── IOSGLContext.mm │ │ │ │ ├── IOSGLImageAllocator.cc │ │ │ │ ├── IOSGLImageAllocator.h │ │ │ │ ├── IOSGLTexture.h │ │ │ │ └── IOSGLTexture.mm │ │ │ ├── operators/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GLAdd.cc │ │ │ │ ├── GLConcat.cc │ │ │ │ ├── GLConvolution.cc │ │ │ │ ├── GLConvolution.h │ │ │ │ ├── GLCopyOps.cc │ │ │ │ ├── GLInstanceNorm.cc │ │ │ │ ├── GLMul.cc │ │ │ │ ├── GLNormPlanarYUV.cc │ │ │ │ ├── GLPRelu.cc │ │ │ │ ├── GLPadImage.cc │ │ │ │ ├── GLPool.cc │ │ │ │ ├── GLResize.cc │ │ │ │ ├── GLSigmoid.cc │ │ │ │ ├── GLSoftmax.cc │ │ │ │ ├── GLStylizer.cc │ │ │ │ ├── GLSub.cc │ │ │ │ └── gl_tiling_utils.h │ │ │ └── test/ │ │ │ ├── TestGLConvolution.cc │ │ │ ├── 
TestGLConvolution.h │ │ │ ├── opengl_test.cc │ │ │ └── opengl_test.h │ │ ├── snpe/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── snpe_ffi.cc │ │ │ ├── snpe_ffi.h │ │ │ ├── snpe_globals.cc │ │ │ ├── snpe_op.cc │ │ │ └── snpe_op_benchmark.cc │ │ └── ulp2/ │ │ ├── ulp.cc │ │ ├── ulp.h │ │ ├── ulp_neon.cc │ │ ├── ulp_neon.h │ │ └── ulp_test.cc │ ├── mpi/ │ │ ├── CMakeLists.txt │ │ ├── mpi_common.cc │ │ ├── mpi_common.h │ │ ├── mpi_gpu_test.cc │ │ ├── mpi_ops.cc │ │ ├── mpi_ops.h │ │ ├── mpi_ops_gpu.cc │ │ └── mpi_test.cc │ ├── observers/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── operator_attaching_net_observer.h │ │ ├── runcnt_observer.cc │ │ ├── runcnt_observer.h │ │ ├── time_observer.cc │ │ ├── time_observer.h │ │ └── time_observer_test.cc │ ├── onnx/ │ │ ├── CMakeLists.txt │ │ ├── backend.cc │ │ ├── backend.h │ │ ├── backend_rep.cc │ │ ├── backend_rep.h │ │ ├── device.cc │ │ ├── device.h │ │ ├── helper.cc │ │ └── helper.h │ ├── operators/ │ │ ├── CMakeLists.txt │ │ ├── abs_op.cc │ │ ├── abs_op.cu │ │ ├── accumulate_op.cc │ │ ├── accumulate_op.cu │ │ ├── accumulate_op.h │ │ ├── accuracy_op.cc │ │ ├── accuracy_op.cu │ │ ├── accuracy_op.h │ │ ├── apmeter_op.cc │ │ ├── apmeter_op.h │ │ ├── assert_op.cc │ │ ├── assert_op.cu │ │ ├── assert_op.h │ │ ├── atomic_ops.cc │ │ ├── batch_box_cox_op.cc │ │ ├── batch_box_cox_op.h │ │ ├── batch_gather_ops.cc │ │ ├── batch_gather_ops.cu │ │ ├── batch_gather_ops.h │ │ ├── batch_matmul_op.cc │ │ ├── batch_matmul_op.cu │ │ ├── batch_matmul_op.h │ │ ├── batch_matmul_op_gpu_test.cc │ │ ├── batch_matmul_op_test.cc │ │ ├── batch_sparse_to_dense_op.cc │ │ ├── batch_sparse_to_dense_op.h │ │ ├── bbox_transform_op.cc │ │ ├── bbox_transform_op.h │ │ ├── boolean_mask_ops.cc │ │ ├── boolean_mask_ops.cu │ │ ├── boolean_mask_ops.h │ │ ├── boolean_unmask_ops.cc │ │ ├── boolean_unmask_ops.cu │ │ ├── boolean_unmask_ops.h │ │ ├── boolean_unmask_ops_test.cc │ │ ├── box_with_nms_limit_op.cc │ │ ├── box_with_nms_limit_op.h │ │ ├── cast_op.cc │ │ ├── cast_op.cu │ │ ├── cast_op.h │ │ ├── ceil_op.cc │ │ ├── ceil_op.cu │ │ ├── ceil_op.h │ │ ├── channel_backprop_stats_op.cc │ │ ├── channel_backprop_stats_op.h │ │ ├── channel_shuffle_op.cc │ │ ├── channel_shuffle_op.h │ │ ├── channel_shuffle_op_gpu.cu │ │ ├── channel_stats_op.cc │ │ ├── channel_stats_op.cu │ │ ├── channel_stats_op.h │ │ ├── clip_op.cc │ │ ├── clip_op.cu │ │ ├── clip_op.h │ │ ├── communicator_op.cc │ │ ├── communicator_op_gpu.cc │ │ ├── concat_split_op.cc │ │ ├── concat_split_op.h │ │ ├── concat_split_op_gpu.cc │ │ ├── conditional_op.cc │ │ ├── conditional_op.h │ │ ├── conv_gradient_op.cc │ │ ├── conv_op.cc │ │ ├── conv_op.h │ │ ├── conv_op_cache_cudnn.cc │ │ ├── conv_op_cache_cudnn.h │ │ ├── conv_op_cache_cudnn_test.cc │ │ ├── conv_op_cudnn.cc │ │ ├── conv_op_eigen.cc │ │ ├── conv_op_gpu.cc │ │ ├── conv_op_impl.h │ │ ├── conv_op_shared.cc │ │ ├── conv_op_shared.h │ │ ├── conv_op_shared_gpu.cc │ │ ├── conv_pool_op_base.h │ │ ├── conv_transpose_gradient_op.cc │ │ ├── conv_transpose_op.cc │ │ ├── conv_transpose_op.h │ │ ├── conv_transpose_op_cudnn.cc │ │ ├── conv_transpose_op_gpu.cc │ │ ├── conv_transpose_op_impl.h │ │ ├── conv_transpose_op_mobile.cc │ │ ├── conv_transpose_op_mobile.h │ │ ├── conv_transpose_op_mobile_impl.h │ │ ├── conv_transpose_op_mobile_test.cc │ │ ├── conv_transpose_unpool_op_base.h │ │ ├── cos_op.cc │ │ ├── cos_op.cu │ │ ├── cosine_embedding_criterion_op.cc │ │ ├── cosine_embedding_criterion_op.cu │ │ ├── cosine_embedding_criterion_op.h │ │ ├── counter_ops.cc │ │ ├── counter_ops.h │ │ 
├── counter_ops_gpu.cc │ │ ├── create_scope_op.cc │ │ ├── create_scope_op.h │ │ ├── cross_entropy_op.cc │ │ ├── cross_entropy_op.cu │ │ ├── cross_entropy_op.h │ │ ├── dataset_ops.cc │ │ ├── dataset_ops.h │ │ ├── deform_conv_gradient_op.cc │ │ ├── deform_conv_op.cc │ │ ├── deform_conv_op.cu │ │ ├── deform_conv_op.h │ │ ├── deform_conv_op_impl.h │ │ ├── distance_op.cc │ │ ├── distance_op.cu │ │ ├── distance_op.h │ │ ├── do_op.cc │ │ ├── do_op.h │ │ ├── do_op_gpu.cc │ │ ├── dropout_op.cc │ │ ├── dropout_op.cu │ │ ├── dropout_op.h │ │ ├── dropout_op_cudnn.cc │ │ ├── elementwise_add_op.cc │ │ ├── elementwise_div_op.cc │ │ ├── elementwise_linear_op.cc │ │ ├── elementwise_linear_op.cu │ │ ├── elementwise_linear_op.h │ │ ├── elementwise_logical_ops.cc │ │ ├── elementwise_logical_ops.h │ │ ├── elementwise_mul_op.cc │ │ ├── elementwise_op.cc │ │ ├── elementwise_op.cu │ │ ├── elementwise_op.h │ │ ├── elementwise_op_gpu_test.cc │ │ ├── elementwise_op_schema.cc │ │ ├── elementwise_op_test.cc │ │ ├── elementwise_op_test.h │ │ ├── elementwise_sub_op.cc │ │ ├── elementwise_sum_op.cc │ │ ├── elu_op.cc │ │ ├── elu_op.cu │ │ ├── elu_op.h │ │ ├── exp_op.cc │ │ ├── exp_op.cu │ │ ├── expand_squeeze_dims_op.cc │ │ ├── expand_squeeze_dims_op.h │ │ ├── expand_squeeze_dims_op_gpu.cc │ │ ├── extend_tensor_op.cc │ │ ├── feed_blob_op.cc │ │ ├── feed_blob_op.h │ │ ├── filler_op.cc │ │ ├── filler_op.cu │ │ ├── filler_op.h │ │ ├── filler_op_gpu.cc │ │ ├── find_duplicate_elements_op.cc │ │ ├── find_duplicate_elements_op.h │ │ ├── find_op.cc │ │ ├── find_op.cu │ │ ├── find_op.h │ │ ├── flatten_op.cc │ │ ├── flatten_op.h │ │ ├── flexible_top_k.cc │ │ ├── flexible_top_k.h │ │ ├── floor_op.cc │ │ ├── floor_op.cu │ │ ├── floor_op.h │ │ ├── free_op.cc │ │ ├── free_op.h │ │ ├── free_op_gpu.cc │ │ ├── fully_connected_op.cc │ │ ├── fully_connected_op.h │ │ ├── fully_connected_op_gpu.cc │ │ ├── fused_rowwise_8bit_conversion_ops.cc │ │ ├── fused_rowwise_8bit_conversion_ops.h │ │ ├── gather_fused_8bit_rowwise_op.cc │ │ ├── gather_fused_8bit_rowwise_op.h │ │ ├── gather_ranges_to_dense_op.cc │ │ ├── gather_ranges_to_dense_op.h │ │ ├── generate_proposals_op.cc │ │ ├── generate_proposals_op.h │ │ ├── generate_proposals_op_test.cc │ │ ├── generate_proposals_op_util_boxes.h │ │ ├── generate_proposals_op_util_boxes_test.cc │ │ ├── generate_proposals_op_util_nms.h │ │ ├── generate_proposals_op_util_nms_test.cc │ │ ├── given_tensor_fill_op.cc │ │ ├── given_tensor_fill_op.cu │ │ ├── given_tensor_fill_op.h │ │ ├── glu_op.cc │ │ ├── glu_op.cu │ │ ├── glu_op.h │ │ ├── gru_unit_op.cc │ │ ├── gru_unit_op.h │ │ ├── gru_unit_op_gpu.cu │ │ ├── h_softmax_op.cc │ │ ├── h_softmax_op.h │ │ ├── half_float_ops.cc │ │ ├── half_float_ops.cu │ │ ├── half_float_ops.h │ │ ├── if_op.cc │ │ ├── if_op.h │ │ ├── if_op_gpu.cc │ │ ├── im2col_op.cc │ │ ├── im2col_op.h │ │ ├── im2col_op_gpu.cc │ │ ├── index_hash_ops.cc │ │ ├── index_hash_ops.h │ │ ├── index_ops.cc │ │ ├── instance_norm_gradient_op.cc │ │ ├── instance_norm_op.cc │ │ ├── instance_norm_op.cu │ │ ├── instance_norm_op.h │ │ ├── jsd_op.cc │ │ ├── jsd_op.h │ │ ├── key_split_ops.cc │ │ ├── key_split_ops.h │ │ ├── last_n_window_collector.cc │ │ ├── layer_norm_op.cc │ │ ├── layer_norm_op.cu │ │ ├── layer_norm_op.h │ │ ├── leaky_relu_op.cc │ │ ├── leaky_relu_op.cu │ │ ├── leaky_relu_op.h │ │ ├── lengths_reducer_fused_8bit_rowwise_ops.cc │ │ ├── lengths_reducer_fused_8bit_rowwise_ops.h │ │ ├── lengths_reducer_ops.cc │ │ ├── lengths_reducer_ops.h │ │ ├── lengths_reducer_rowwise_8bit_ops.cc │ │ ├── 
lengths_reducer_rowwise_8bit_ops.h │ │ ├── lengths_tile_op.cc │ │ ├── lengths_tile_op.h │ │ ├── lengths_tile_op_gpu.cc │ │ ├── lengths_top_k_op.cc │ │ ├── lengths_top_k_op.h │ │ ├── listwise_l2r_op.cc │ │ ├── listwise_l2r_op.h │ │ ├── load_save_op.cc │ │ ├── load_save_op.h │ │ ├── load_save_op_gpu.cc │ │ ├── local_response_normalization_op.cc │ │ ├── local_response_normalization_op.cu │ │ ├── local_response_normalization_op.h │ │ ├── local_response_normalization_op_cudnn.cc │ │ ├── locally_connected_op.cc │ │ ├── locally_connected_op.h │ │ ├── locally_connected_op_gpu.cc │ │ ├── locally_connected_op_impl.h │ │ ├── locally_connected_op_util.cc │ │ ├── locally_connected_op_util.h │ │ ├── log_op.cc │ │ ├── log_op.cu │ │ ├── logit_op.cc │ │ ├── logit_op.cu │ │ ├── logit_op.h │ │ ├── loss_op.cc │ │ ├── loss_op.cu │ │ ├── loss_op.h │ │ ├── lp_pool_op.cc │ │ ├── lp_pool_op.cu │ │ ├── lpnorm_op.cc │ │ ├── lpnorm_op.h │ │ ├── lstm_unit_op.cc │ │ ├── lstm_unit_op.h │ │ ├── lstm_unit_op_gpu.cu │ │ ├── map_ops.cc │ │ ├── map_ops.h │ │ ├── margin_ranking_criterion_op.cc │ │ ├── margin_ranking_criterion_op.cu │ │ ├── margin_ranking_criterion_op.h │ │ ├── math_ops.cc │ │ ├── math_ops.cu │ │ ├── math_ops.h │ │ ├── matmul_op.cc │ │ ├── matmul_op.h │ │ ├── matmul_op_gpu.cc │ │ ├── max_pool_with_index.cu │ │ ├── max_pool_with_index.h │ │ ├── mean_op.cc │ │ ├── mean_op.cu │ │ ├── mean_op.h │ │ ├── mem_query_op.cu │ │ ├── merge_id_lists_op.cc │ │ ├── merge_id_lists_op.h │ │ ├── minmax_gradient_ops.cc │ │ ├── minmax_ops.cc │ │ ├── minmax_ops.h │ │ ├── mod_op.cc │ │ ├── mod_op.h │ │ ├── multi_class_accuracy_op.cc │ │ ├── multi_class_accuracy_op.cu │ │ ├── multi_class_accuracy_op.h │ │ ├── negate_gradient_op.cc │ │ ├── negate_gradient_op.h │ │ ├── negate_gradient_op_gpu.cc │ │ ├── negative_op.cc │ │ ├── negative_op.cu │ │ ├── ngram_ops.cc │ │ ├── ngram_ops.h │ │ ├── no_default_engine_op.h │ │ ├── norm_planar_yuv_op.cc │ │ ├── normalize_l1_op.cc │ │ ├── normalize_l1_op.h │ │ ├── normalize_op.cc │ │ ├── normalize_op.h │ │ ├── normalize_ops.cu │ │ ├── one_hot_ops.cc │ │ ├── one_hot_ops.cu │ │ ├── one_hot_ops.h │ │ ├── onnx_while_op.cc │ │ ├── onnx_while_op.h │ │ ├── op_utils_cudnn.h │ │ ├── operator_fallback_gpu.h │ │ ├── operator_fallback_gpu_test.cc │ │ ├── order_switch_ops.cc │ │ ├── order_switch_ops.cu │ │ ├── order_switch_ops.h │ │ ├── pack_rnn_sequence_op.cc │ │ ├── pack_rnn_sequence_op.h │ │ ├── pack_segments.cc │ │ ├── pack_segments.cu │ │ ├── pack_segments.h │ │ ├── pad_op.cc │ │ ├── pad_op.h │ │ ├── pad_op_gpu.cu │ │ ├── partition_ops.cc │ │ ├── partition_ops.h │ │ ├── percentile_op.cc │ │ ├── percentile_op.h │ │ ├── perplexity_op.cc │ │ ├── perplexity_op.cu │ │ ├── perplexity_op.h │ │ ├── piecewise_linear_transform_op.cc │ │ ├── piecewise_linear_transform_op.cu │ │ ├── piecewise_linear_transform_op.h │ │ ├── pool_gradient_op.cc │ │ ├── pool_op.cc │ │ ├── pool_op.cu │ │ ├── pool_op.h │ │ ├── pool_op_cudnn.cu │ │ ├── pow_op.cc │ │ ├── pow_op.cu │ │ ├── pow_op.h │ │ ├── prefetch_op.h │ │ ├── prelu_op.cc │ │ ├── prelu_op.cu │ │ ├── prelu_op.h │ │ ├── prepend_dim_op.cc │ │ ├── prepend_dim_op.h │ │ ├── prepend_dim_op_gpu.cc │ │ ├── quant_decode_op.cc │ │ ├── quant_decode_op.h │ │ ├── rank_loss_op.cc │ │ ├── rank_loss_op.h │ │ ├── recurrent_network_blob_fetcher_op.cc │ │ ├── recurrent_network_blob_fetcher_op.h │ │ ├── recurrent_network_blob_fetcher_op_gpu.cc │ │ ├── recurrent_network_executor.cc │ │ ├── recurrent_network_executor.h │ │ ├── recurrent_network_executor_gpu.cc │ │ ├── 
recurrent_network_executor_gpu.h │ │ ├── recurrent_network_executor_incl.h │ │ ├── recurrent_network_op.cc │ │ ├── recurrent_network_op.h │ │ ├── recurrent_network_op_gpu.cu │ │ ├── recurrent_op_cudnn.cc │ │ ├── recurrent_op_cudnn.h │ │ ├── reduce_ops.cc │ │ ├── reduce_ops.h │ │ ├── reducer_functors.h │ │ ├── reduction_front_back_ops.cc │ │ ├── reduction_front_back_ops.cu │ │ ├── reduction_front_back_ops.h │ │ ├── reduction_ops.cc │ │ ├── reduction_ops.cu │ │ ├── reduction_ops.h │ │ ├── relu_op.cc │ │ ├── relu_op.cu │ │ ├── relu_op.h │ │ ├── relu_op_cudnn.cc │ │ ├── relu_op_fp16.cu │ │ ├── remove_data_blocks_op.cc │ │ ├── remove_data_blocks_op.h │ │ ├── replace_nan_op.cc │ │ ├── replace_nan_op.cu │ │ ├── replace_nan_op.h │ │ ├── reservoir_sampling.cc │ │ ├── reshape_op.cc │ │ ├── reshape_op.h │ │ ├── reshape_op_gpu.cc │ │ ├── reshape_op_gpu_test.cc │ │ ├── resize_op.cc │ │ ├── resize_op.cu │ │ ├── resize_op.h │ │ ├── reverse_packed_segs_op.cc │ │ ├── reverse_packed_segs_op.cu │ │ ├── reverse_packed_segs_op.h │ │ ├── rmac_regions_op.cc │ │ ├── rmac_regions_op.cu │ │ ├── rmac_regions_op.h │ │ ├── roi_align_gradient_op.cc │ │ ├── roi_align_gradient_op.cu │ │ ├── roi_align_gradient_op.h │ │ ├── roi_align_op.cc │ │ ├── roi_align_op.cu │ │ ├── roi_align_op.h │ │ ├── roi_align_op_gpu_test.cc │ │ ├── roi_pool_op.cc │ │ ├── roi_pool_op.cu │ │ ├── roi_pool_op.h │ │ ├── rowmul_op.cc │ │ ├── rowmul_op.h │ │ ├── scale_op.cc │ │ ├── scale_op.h │ │ ├── scale_op_gpu.cc │ │ ├── segment_reduction_op.cc │ │ ├── segment_reduction_op.h │ │ ├── segment_reduction_op_gpu.cu │ │ ├── selu_op.cc │ │ ├── selu_op.cu │ │ ├── selu_op.h │ │ ├── sequence_ops.cc │ │ ├── sequence_ops.cu │ │ ├── sequence_ops.h │ │ ├── shape_op.cc │ │ ├── shape_op.h │ │ ├── shape_op_gpu.cc │ │ ├── sigmoid_op.cc │ │ ├── sigmoid_op.cu │ │ ├── sin_op.cc │ │ ├── sin_op.cu │ │ ├── sinusoid_position_encoding_op.cc │ │ ├── sinusoid_position_encoding_op.h │ │ ├── slice_op.cc │ │ ├── slice_op.cu │ │ ├── slice_op.h │ │ ├── softmax_op.cc │ │ ├── softmax_op.h │ │ ├── softmax_op_cudnn.cc │ │ ├── softmax_ops.cu │ │ ├── softmax_shared.cc │ │ ├── softmax_shared.h │ │ ├── softmax_with_loss_op.cc │ │ ├── softmax_with_loss_op.h │ │ ├── softplus_op.cc │ │ ├── softplus_op.cu │ │ ├── softplus_op.h │ │ ├── softsign_op.cc │ │ ├── softsign_op.cu │ │ ├── space_batch_op.cc │ │ ├── space_batch_op.h │ │ ├── space_batch_op_gpu.cu │ │ ├── sparse_normalize_op.cc │ │ ├── sparse_normalize_op.h │ │ ├── sparse_to_dense_mask_op.cc │ │ ├── sparse_to_dense_mask_op.h │ │ ├── sparse_to_dense_op.cc │ │ ├── sparse_to_dense_op.cu │ │ ├── sparse_to_dense_op.h │ │ ├── spatial_batch_norm_gradient_op.cc │ │ ├── spatial_batch_norm_op.cc │ │ ├── spatial_batch_norm_op.h │ │ ├── spatial_batch_norm_op_cudnn.cc │ │ ├── spatial_softmax_with_loss_op.cc │ │ ├── spatial_softmax_with_loss_op.h │ │ ├── sqrt_op.cc │ │ ├── sqrt_op.cu │ │ ├── square_root_divide_op.cc │ │ ├── square_root_divide_op.h │ │ ├── stats_ops.cc │ │ ├── stop_gradient.cc │ │ ├── stop_gradient.h │ │ ├── stop_gradient_gpu.cc │ │ ├── string_ops.cc │ │ ├── string_ops.h │ │ ├── string_ops_test.cc │ │ ├── stylizer_ops.cc │ │ ├── summarize_op.cc │ │ ├── summarize_op.cu │ │ ├── summarize_op.h │ │ ├── swish_op.cc │ │ ├── swish_op.cu │ │ ├── swish_op.h │ │ ├── tanh_op.cc │ │ ├── tanh_op.cu │ │ ├── tensor_protos_db_input.cc │ │ ├── tensor_protos_db_input.h │ │ ├── tensor_protos_db_input_gpu.cc │ │ ├── text_file_reader.cc │ │ ├── text_file_reader_utils.cc │ │ ├── text_file_reader_utils.h │ │ ├── text_file_reader_utils_test.cc │ │ ├── 
thresholded_relu_op.cc │ │ ├── thresholded_relu_op.cu │ │ ├── thresholded_relu_op.h │ │ ├── tile_op.cc │ │ ├── tile_op.cu │ │ ├── tile_op.h │ │ ├── top_k.cc │ │ ├── top_k.cu │ │ ├── top_k.h │ │ ├── top_k_heap_selection.cuh │ │ ├── top_k_radix_selection.cuh │ │ ├── transpose_op.cc │ │ ├── transpose_op.cu │ │ ├── transpose_op.h │ │ ├── transpose_op_cudnn.cc │ │ ├── tt_linear_op.cc │ │ ├── tt_linear_op.h │ │ ├── utility_ops.cc │ │ ├── utility_ops.cu │ │ ├── utility_ops.h │ │ ├── utility_ops_gpu.cc │ │ ├── utility_ops_gpu_test.cc │ │ ├── utility_ops_test.cc │ │ ├── variable_length_sequence_padding.cc │ │ ├── variable_length_sequence_padding.h │ │ ├── weighted_sample_op.cc │ │ ├── weighted_sample_op.cu │ │ ├── weighted_sample_op.h │ │ ├── while_op.cc │ │ ├── while_op.h │ │ ├── while_op_gpu.cc │ │ ├── workspace_ops.cc │ │ ├── zero_gradient_op.cc │ │ ├── zero_gradient_op.h │ │ └── zero_gradient_op_gpu.cc │ ├── perfkernels/ │ │ ├── CMakeLists.txt │ │ ├── common.h │ │ ├── common_avx.cc │ │ ├── common_avx2.cc │ │ ├── cvtsh_ss_bugfix.h │ │ ├── embedding_lookup.cc │ │ ├── embedding_lookup.h │ │ ├── embedding_lookup_avx2.cc │ │ ├── embedding_lookup_fused_8bit_rowwise_avx2.cc │ │ ├── fused_8bit_rowwise_embedding_lookup.cc │ │ ├── fused_8bit_rowwise_embedding_lookup.h │ │ ├── hp_emblookup_codegen.py │ │ ├── typed_axpy.cc │ │ ├── typed_axpy.h │ │ ├── typed_axpy_avx.cc │ │ └── typed_axpy_avx2.cc │ ├── proto/ │ │ ├── CMakeLists.txt │ │ ├── __init__.py │ │ ├── caffe2.proto │ │ ├── caffe2_legacy.proto │ │ ├── hsm.proto │ │ ├── metanet.proto │ │ ├── predictor_consts.proto │ │ └── prof_dag.proto │ ├── python/ │ │ ├── CMakeLists.txt │ │ ├── __init__.py │ │ ├── _import_c_extension.py │ │ ├── allcompare_test.py │ │ ├── attention.py │ │ ├── benchmark_generator.py │ │ ├── binarysize.py │ │ ├── brew.py │ │ ├── brew_test.py │ │ ├── build.py │ │ ├── cached_reader.py │ │ ├── caffe_translator.py │ │ ├── caffe_translator_test.py │ │ ├── checkpoint.py │ │ ├── checkpoint_test.py │ │ ├── cnn.py │ │ ├── context.py │ │ ├── context_test.py │ │ ├── control.py │ │ ├── control_ops_grad.py │ │ ├── control_ops_util.py │ │ ├── control_test.py │ │ ├── convnet_benchmarks.py │ │ ├── convnet_benchmarks_test.py │ │ ├── core.py │ │ ├── core_gradients_test.py │ │ ├── core_test.py │ │ ├── crf.py │ │ ├── data_parallel_model.py │ │ ├── data_parallel_model_test.py │ │ ├── data_parallel_model_utils.py │ │ ├── data_workers.py │ │ ├── data_workers_test.py │ │ ├── dataio.py │ │ ├── dataio_test.py │ │ ├── dataset.py │ │ ├── db_test.py │ │ ├── device_checker.py │ │ ├── dlpack.h │ │ ├── docs/ │ │ │ ├── formatter.py │ │ │ ├── generator.py │ │ │ ├── github.py │ │ │ └── parser.py │ │ ├── dyndep.py │ │ ├── embedding_generation_benchmark.py │ │ ├── examples/ │ │ │ ├── char_rnn.py │ │ │ ├── lmdb_create_example.py │ │ │ └── resnet50_trainer.py │ │ ├── experiment_util.py │ │ ├── extension_loader.py │ │ ├── functional.py │ │ ├── functional_test.py │ │ ├── fused_8bit_rowwise_conversion_ops_test.py │ │ ├── gradient_check_test.py │ │ ├── gradient_checker.py │ │ ├── gru_cell.py │ │ ├── helpers/ │ │ │ ├── __init__.py │ │ │ ├── algebra.py │ │ │ ├── arg_scope.py │ │ │ ├── array_helpers.py │ │ │ ├── control_ops.py │ │ │ ├── conv.py │ │ │ ├── db_input.py │ │ │ ├── dropout.py │ │ │ ├── elementwise_linear.py │ │ │ ├── fc.py │ │ │ ├── nonlinearity.py │ │ │ ├── normalization.py │ │ │ ├── pooling.py │ │ │ ├── tools.py │ │ │ └── train.py │ │ ├── hsm_util.py │ │ ├── hypothesis_test.py │ │ ├── hypothesis_test_util.py │ │ ├── layer_model_helper.py │ │ ├── 
layer_model_instantiator.py │ │ ├── layer_parameter_sharing_test.py │ │ ├── layer_test_util.py │ │ ├── layers/ │ │ │ ├── __init__.py │ │ │ ├── add_bias.py │ │ │ ├── arc_cosine_feature_map.py │ │ │ ├── batch_distill_lr_loss.py │ │ │ ├── batch_lr_loss.py │ │ │ ├── batch_mse_loss.py │ │ │ ├── batch_normalization.py │ │ │ ├── batch_sigmoid_cross_entropy_loss.py │ │ │ ├── batch_softmax_loss.py │ │ │ ├── concat.py │ │ │ ├── conv.py │ │ │ ├── dropout.py │ │ │ ├── fc.py │ │ │ ├── fc_without_bias.py │ │ │ ├── feature_sparse_to_dense.py │ │ │ ├── functional.py │ │ │ ├── gather_record.py │ │ │ ├── last_n_window_collector.py │ │ │ ├── layers.py │ │ │ ├── margin_rank_loss.py │ │ │ ├── merge_id_lists.py │ │ │ ├── pairwise_dot_product.py │ │ │ ├── position_weighted.py │ │ │ ├── random_fourier_features.py │ │ │ ├── reservoir_sampling.py │ │ │ ├── sampling_train.py │ │ │ ├── sampling_trainable_mixin.py │ │ │ ├── select_record_by_context.py │ │ │ ├── semi_random_features.py │ │ │ ├── sparse_feature_hash.py │ │ │ ├── sparse_lookup.py │ │ │ ├── split.py │ │ │ ├── tags.py │ │ │ └── uniform_sampling.py │ │ ├── layers_test.py │ │ ├── lengths_reducer_fused_8bit_rowwise_ops_test.py │ │ ├── lengths_reducer_rowwise_8bit_ops_test.py │ │ ├── lstm_benchmark.py │ │ ├── memonger.py │ │ ├── memonger_test.py │ │ ├── mint/ │ │ │ ├── app.py │ │ │ ├── static/ │ │ │ │ └── css/ │ │ │ │ └── simple-sidebar.css │ │ │ └── templates/ │ │ │ └── index.html │ │ ├── mkl/ │ │ │ ├── convnet_benchmarks.py │ │ │ ├── mkl_LRN_op_test.py │ │ │ ├── mkl_LRN_speed_test.py │ │ │ ├── mkl_concat_op_test.py │ │ │ ├── mkl_conv_op_test.py │ │ │ ├── mkl_copy_op_test.py │ │ │ ├── mkl_elementwise_add_op_test.py │ │ │ ├── mkl_elementwise_sum_op_test.py │ │ │ ├── mkl_fc_op_test.py │ │ │ ├── mkl_fc_speed_test.py │ │ │ ├── mkl_fill_op_test.py │ │ │ ├── mkl_pool_op_test.py │ │ │ ├── mkl_pool_speed_test.py │ │ │ ├── mkl_relu_op_test.py │ │ │ ├── mkl_sbn_op_test.py │ │ │ ├── mkl_sbn_speed_test.py │ │ │ ├── mkl_sigmoid_op_test.py │ │ │ ├── mkl_speed_test.py │ │ │ ├── mkl_squeeze_op_test.py │ │ │ ├── rewrite_graph.py │ │ │ └── rewrite_graph_test.py │ │ ├── mkl_test_util.py │ │ ├── model_device_test.py │ │ ├── model_helper.py │ │ ├── modeling/ │ │ │ ├── __init__.py │ │ │ ├── compute_norm_for_blobs.py │ │ │ ├── compute_norm_for_blobs_test.py │ │ │ ├── compute_statistics_for_blobs.py │ │ │ ├── compute_statistics_for_blobs_test.py │ │ │ ├── initializers.py │ │ │ ├── initializers_test.py │ │ │ ├── net_modifier.py │ │ │ ├── parameter_info.py │ │ │ ├── parameter_sharing.py │ │ │ └── parameter_sharing_test.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── __sym_init__.py │ │ │ ├── download.py │ │ │ ├── resnet.py │ │ │ ├── resnet_test.py │ │ │ └── seq2seq/ │ │ │ ├── __init__.py │ │ │ ├── beam_search.py │ │ │ ├── seq2seq_beam_search_test.py │ │ │ ├── seq2seq_model_helper.py │ │ │ ├── seq2seq_model_helper_test.py │ │ │ ├── seq2seq_util.py │ │ │ ├── train.py │ │ │ └── translate.py │ │ ├── modifier_context.py │ │ ├── mpi_python.cc │ │ ├── muji.py │ │ ├── muji_test.py │ │ ├── net_builder.py │ │ ├── net_builder_test.py │ │ ├── net_drawer.py │ │ ├── net_printer.py │ │ ├── net_printer_test.py │ │ ├── numa_benchmark.py │ │ ├── numa_test.py │ │ ├── observer_test.py │ │ ├── onnx/ │ │ │ ├── ONNXOpCoverage.md │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ ├── backend_cpp_rep.py │ │ │ ├── backend_rep.py │ │ │ ├── bin/ │ │ │ │ ├── __init__.py │ │ │ │ └── conversion.py │ │ │ ├── error.py │ │ │ ├── frontend.py │ │ │ ├── helper.py │ │ │ ├── tests/ │ │ │ │ ├── 
__init__.py │ │ │ │ ├── c2_ref_test.py │ │ │ │ ├── conversion_test.py │ │ │ │ ├── helper_test.py │ │ │ │ ├── onnx_backend_test.py │ │ │ │ ├── optimize_onnx_test.py │ │ │ │ ├── ssa_test.py │ │ │ │ └── test_utils.py │ │ │ └── workspace.py │ │ ├── operator_test/ │ │ │ ├── activation_ops_test.py │ │ │ ├── adagrad_test.py │ │ │ ├── adam_test.py │ │ │ ├── apmeter_test.py │ │ │ ├── assert_test.py │ │ │ ├── atomic_ops_test.py │ │ │ ├── basic_rnn_test.py │ │ │ ├── batch_box_cox_test.py │ │ │ ├── batch_sparse_to_dense_op_test.py │ │ │ ├── bbox_transform_test.py │ │ │ ├── blobs_queue_db_test.py │ │ │ ├── boolean_mask_test.py │ │ │ ├── boolean_unmask_test.py │ │ │ ├── cast_op_test.py │ │ │ ├── ceil_op_test.py │ │ │ ├── channel_backprop_stats_op_test.py │ │ │ ├── channel_shuffle_test.py │ │ │ ├── channel_stats_op_test.py │ │ │ ├── checkpoint_test.py │ │ │ ├── clip_op_test.py │ │ │ ├── clip_tensor_op_test.py │ │ │ ├── concat_split_op_test.py │ │ │ ├── conditional_test.py │ │ │ ├── conv_test.py │ │ │ ├── conv_transpose_test.py │ │ │ ├── copy_ops_test.py │ │ │ ├── cosine_embedding_criterion_op_test.py │ │ │ ├── counter_ops_test.py │ │ │ ├── crf_test.py │ │ │ ├── cross_entropy_ops_test.py │ │ │ ├── cudnn_recurrent_test.py │ │ │ ├── dataset_ops_test.py │ │ │ ├── deform_conv_test.py │ │ │ ├── distance_op_test.py │ │ │ ├── dropout_op_test.py │ │ │ ├── duplicate_operands_test.py │ │ │ ├── elementwise_linear_op_test.py │ │ │ ├── elementwise_logical_ops_test.py │ │ │ ├── elementwise_op_broadcast_test.py │ │ │ ├── elementwise_ops_test.py │ │ │ ├── emptysample_ops_test.py │ │ │ ├── extend_tensor_op_test.py │ │ │ ├── fc_operator_test.py │ │ │ ├── filler_ops_test.py │ │ │ ├── find_op_test.py │ │ │ ├── flatten_op_test.py │ │ │ ├── flexible_top_k_test.py │ │ │ ├── floor_op_test.py │ │ │ ├── gather_ops_test.py │ │ │ ├── gather_ranges_op_test.py │ │ │ ├── given_tensor_fill_op_test.py │ │ │ ├── glu_op_test.py │ │ │ ├── group_conv_test.py │ │ │ ├── gru_test.py │ │ │ ├── hsm_test.py │ │ │ ├── im2col_col2im_test.py │ │ │ ├── image_input_op_test.py │ │ │ ├── index_hash_ops_test.py │ │ │ ├── index_ops_test.py │ │ │ ├── instance_norm_test.py │ │ │ ├── jsd_ops_test.py │ │ │ ├── key_split_ops_test.py │ │ │ ├── lars_test.py │ │ │ ├── layer_norm_op_test.py │ │ │ ├── lc_operator_test.py │ │ │ ├── leaky_relu_test.py │ │ │ ├── learning_rate_op_test.py │ │ │ ├── lengths_tile_op_test.py │ │ │ ├── lengths_top_k_ops_test.py │ │ │ ├── listwise_l2r_operator_test.py │ │ │ ├── load_save_test.py │ │ │ ├── loss_ops_test.py │ │ │ ├── lpnorm_op_test.py │ │ │ ├── map_ops_test.py │ │ │ ├── margin_ranking_criterion_op_test.py │ │ │ ├── math_ops_test.py │ │ │ ├── matmul_op_test.py │ │ │ ├── mean_op_test.py │ │ │ ├── merge_id_lists_op_test.py │ │ │ ├── mkl_conv_op_test.py │ │ │ ├── mkl_packed_fc_op_test.py │ │ │ ├── mkl_speed_test.py │ │ │ ├── mod_op_test.py │ │ │ ├── momentum_sgd_test.py │ │ │ ├── mpi_test.py │ │ │ ├── negate_gradient_op_test.py │ │ │ ├── ngram_ops_test.py │ │ │ ├── normalize_op_test.py │ │ │ ├── one_hot_ops_test.py │ │ │ ├── onnx_while_test.py │ │ │ ├── pack_ops_test.py │ │ │ ├── pack_rnn_sequence_op_test.py │ │ │ ├── pad_test.py │ │ │ ├── partition_ops_test.py │ │ │ ├── percentile_op_test.py │ │ │ ├── piecewise_linear_transform_test.py │ │ │ ├── pooling_test.py │ │ │ ├── prepend_dim_test.py │ │ │ ├── python_op_test.py │ │ │ ├── rank_loss_operator_test.py │ │ │ ├── rebatching_queue_test.py │ │ │ ├── record_queue_test.py │ │ │ ├── recurrent_net_executor_test.py │ │ │ ├── recurrent_network_test.py │ │ │ ├── reduce_ops_test.py │ │ │ 
├── reduction_ops_test.py │ │ │ ├── relu_op_test.py │ │ │ ├── reshape_ops_test.py │ │ │ ├── resize_op_test.py │ │ │ ├── rmac_regions_op_test.py │ │ │ ├── rnn_cell_test.py │ │ │ ├── segment_ops_test.py │ │ │ ├── selu_op_test.py │ │ │ ├── sequence_ops_test.py │ │ │ ├── shape_inference_test.py │ │ │ ├── sinusoid_position_encoding_op_test.py │ │ │ ├── softmax_ops_test.py │ │ │ ├── softplus_op_test.py │ │ │ ├── sparse_gradient_checker_test.py │ │ │ ├── sparse_lengths_sum_benchmark.py │ │ │ ├── sparse_normalize_test.py │ │ │ ├── sparse_ops_test.py │ │ │ ├── sparse_to_dense_mask_op_test.py │ │ │ ├── spatial_bn_op_test.py │ │ │ ├── specialized_segment_ops_test.py │ │ │ ├── square_root_divide_op_test.py │ │ │ ├── stats_ops_test.py │ │ │ ├── string_ops_test.py │ │ │ ├── text_file_reader_test.py │ │ │ ├── thresholded_relu_op_test.py │ │ │ ├── tile_op_test.py │ │ │ ├── top_k_test.py │ │ │ ├── unique_uniform_fill_op_test.py │ │ │ ├── utility_ops_test.py │ │ │ ├── video_input_op_test.py │ │ │ ├── weighted_sample_test.py │ │ │ └── weighted_sum_test.py │ │ ├── optimizer.py │ │ ├── optimizer_context.py │ │ ├── optimizer_test.py │ │ ├── optimizer_test_util.py │ │ ├── parallel_workers.py │ │ ├── parallel_workers_test.py │ │ ├── parallelize_bmuf_distributed_test.py │ │ ├── pipeline.py │ │ ├── pipeline_test.py │ │ ├── predictor/ │ │ │ ├── __init__.py │ │ │ ├── mobile_exporter.py │ │ │ ├── mobile_exporter_test.py │ │ │ ├── predictor_exporter.py │ │ │ ├── predictor_exporter_test.py │ │ │ ├── predictor_py_utils.py │ │ │ ├── predictor_test.py │ │ │ └── serde.py │ │ ├── predictor_constants.py │ │ ├── pybind_state.cc │ │ ├── pybind_state.h │ │ ├── pybind_state_dlpack.cc │ │ ├── pybind_state_dlpack.h │ │ ├── pybind_state_gpu.cc │ │ ├── pybind_state_mkl.cc │ │ ├── python_op_test.py │ │ ├── queue_util.py │ │ ├── record_queue.py │ │ ├── recurrent.py │ │ ├── regularizer.py │ │ ├── regularizer_context.py │ │ ├── regularizer_test.py │ │ ├── rnn/ │ │ │ ├── lstm_comparison.py │ │ │ └── rnn_cell_test_util.py │ │ ├── rnn_cell.py │ │ ├── schema.py │ │ ├── schema_test.py │ │ ├── scope.py │ │ ├── scope_test.py │ │ ├── session.py │ │ ├── session_test.py │ │ ├── sparse_to_dense_mask_test.py │ │ ├── sparse_to_dense_test.py │ │ ├── task.py │ │ ├── test/ │ │ │ ├── blob_deallocation_test.py │ │ │ ├── do_op_test.py │ │ │ ├── executor_test.py │ │ │ └── executor_test_util.py │ │ ├── test_util.py │ │ ├── text_file_reader.py │ │ ├── timeout_guard.py │ │ ├── toy_regression_test.py │ │ ├── tt_core.py │ │ ├── tt_core_test.py │ │ ├── tutorials/ │ │ │ ├── Basics.ipynb │ │ │ ├── Control_Ops.ipynb │ │ │ ├── Getting_Caffe1_Models_for_Translation.ipynb │ │ │ ├── Image_Pre-Processing_Pipeline.ipynb │ │ │ ├── Loading_Pretrained_Models.ipynb │ │ │ ├── MNIST.ipynb │ │ │ ├── MNIST_Dataset_and_Databases.ipynb │ │ │ ├── Model_Quickload.ipynb │ │ │ ├── Multi-GPU_Training.ipynb │ │ │ ├── Python_Op.ipynb │ │ │ ├── README.md │ │ │ ├── Toy_Regression.ipynb │ │ │ ├── Training_a_Model.ipynb │ │ │ ├── create_your_own_dataset.ipynb │ │ │ ├── experimental/ │ │ │ │ └── Immediate.ipynb │ │ │ ├── helpers.py │ │ │ ├── inference_codes.txt │ │ │ ├── jupyter_notebook_config.py │ │ │ ├── py_gen/ │ │ │ │ ├── Basics.py │ │ │ │ ├── Control_Ops.py │ │ │ │ ├── Getting_Caffe1_Models_for_Translation.py │ │ │ │ ├── Image_Pre-Processing_Pipeline.py │ │ │ │ ├── Loading_Pretrained_Models.py │ │ │ │ ├── MNIST.py │ │ │ │ ├── MNIST_Dataset_and_Databases.py │ │ │ │ ├── Model_Quickload.py │ │ │ │ ├── Multi-GPU_Training.py │ │ │ │ ├── Python_Op.py │ │ │ │ ├── Toy_Regression.py │ │ │ │ ├── 
Training_a_Model.py │ │ │ │ ├── create_your_own_dataset.py │ │ │ │ └── sparseNN.py │ │ │ ├── start_ipython_notebook.sh │ │ │ └── tutorials_to_script_converter.py │ │ ├── utils.py │ │ ├── visualize.py │ │ ├── workspace.py │ │ └── workspace_test.py │ ├── queue/ │ │ ├── CMakeLists.txt │ │ ├── blobs_queue.cc │ │ ├── blobs_queue.h │ │ ├── blobs_queue_db.cc │ │ ├── blobs_queue_db.h │ │ ├── queue_ops.cc │ │ ├── queue_ops.h │ │ ├── queue_ops_gpu.cc │ │ ├── rebatching_queue.cc │ │ ├── rebatching_queue.h │ │ ├── rebatching_queue_ops.cc │ │ └── rebatching_queue_ops.h │ ├── sgd/ │ │ ├── CMakeLists.txt │ │ ├── adagrad_op.cc │ │ ├── adagrad_op.h │ │ ├── adagrad_op_gpu.cu │ │ ├── adam_op.cc │ │ ├── adam_op.h │ │ ├── adam_op_gpu.cu │ │ ├── clip_tensor_op.cc │ │ ├── clip_tensor_op.h │ │ ├── fp16_momentum_sgd_op.cu │ │ ├── fp16_momentum_sgd_op.h │ │ ├── fp32_momentum_sgd_op.cu │ │ ├── fp32_momentum_sgd_op.h │ │ ├── ftrl_op.cc │ │ ├── ftrl_op.h │ │ ├── iter_op.cc │ │ ├── iter_op.h │ │ ├── iter_op_gpu.cc │ │ ├── lars_op.cc │ │ ├── lars_op.h │ │ ├── lars_op_gpu.cu │ │ ├── learning_rate_functors.h │ │ ├── learning_rate_op.cc │ │ ├── learning_rate_op.h │ │ ├── learning_rate_op_gpu.cc │ │ ├── momentum_sgd_op.cc │ │ ├── momentum_sgd_op.h │ │ ├── momentum_sgd_op_gpu.cu │ │ ├── rmsprop_op.cc │ │ ├── rmsprop_op.h │ │ ├── rmsprop_op_gpu.cu │ │ ├── yellowfin_op.cc │ │ ├── yellowfin_op.h │ │ └── yellowfin_op_gpu.cu │ ├── share/ │ │ ├── CMakeLists.txt │ │ └── contrib/ │ │ ├── CMakeLists.txt │ │ ├── nnpack/ │ │ │ ├── CMakeLists.txt │ │ │ ├── conv_op.cc │ │ │ └── conv_op_test.cc │ │ └── zstd/ │ │ ├── CMakeLists.txt │ │ ├── quant_decomp_zstd_op.cc │ │ └── quant_decomp_zstd_op.h │ ├── test/ │ │ ├── assets/ │ │ │ └── squeeze_predict_net.pb │ │ └── caffe2_gtest_main.cc │ ├── transforms/ │ │ ├── CMakeLists.txt │ │ ├── common_subexpression_elimination.cc │ │ ├── common_subexpression_elimination.h │ │ ├── common_subexpression_elimination_test.cc │ │ ├── conv_to_nnpack_transform.cc │ │ ├── conv_to_nnpack_transform.h │ │ ├── conv_to_nnpack_transform_test.cc │ │ ├── pattern_net_transform.cc │ │ ├── pattern_net_transform.h │ │ ├── pattern_net_transform_test.cc │ │ ├── single_op_transform.cc │ │ └── single_op_transform.h │ ├── utils/ │ │ ├── CMakeLists.txt │ │ ├── GpuBitonicSort.cuh │ │ ├── GpuDefs.cuh │ │ ├── GpuScanUtils.cuh │ │ ├── cast.h │ │ ├── cblas.h │ │ ├── conversions.h │ │ ├── cpu_neon.h │ │ ├── cpuid.cc │ │ ├── cpuid.h │ │ ├── cpuid_test.cc │ │ ├── eigen_utils.h │ │ ├── fatal_signal_asan_no_sig_test.cc │ │ ├── fixed_divisor.h │ │ ├── fixed_divisor_test.cc │ │ ├── math-detail.h │ │ ├── math.h │ │ ├── math_cpu.cc │ │ ├── math_gpu.cu │ │ ├── math_gpu_test.cc │ │ ├── math_test.cc │ │ ├── mixed_utils.h │ │ ├── murmur_hash3.cc │ │ ├── murmur_hash3.h │ │ ├── proto_utils.cc │ │ ├── proto_utils.h │ │ ├── proto_utils_test.cc │ │ ├── signal_handler.cc │ │ ├── signal_handler.h │ │ ├── simple_queue.h │ │ ├── simple_queue_test.cc │ │ ├── smart_tensor_printer.cc │ │ ├── smart_tensor_printer.h │ │ ├── smart_tensor_printer_test.cc │ │ ├── string_utils.cc │ │ ├── string_utils.h │ │ ├── thread_pool.h │ │ ├── threadpool/ │ │ │ ├── ThreadPool.cc │ │ │ ├── ThreadPool.h │ │ │ ├── ThreadPoolCommon.h │ │ │ ├── WorkersPool.h │ │ │ ├── pthreadpool.cc │ │ │ ├── pthreadpool.h │ │ │ ├── pthreadpool_impl.cc │ │ │ └── pthreadpool_impl.h │ │ └── zmq_helper.h │ └── video/ │ ├── CMakeLists.txt │ ├── optical_flow.cc │ ├── optical_flow.h │ ├── video_decoder.cc │ ├── video_decoder.h │ ├── video_input_op.cc │ ├── video_input_op.h │ ├── video_input_op_gpu.cc │ 
├── video_io.cc │ └── video_io.h ├── cmake/ │ ├── BuildVariables.cmake │ ├── Caffe2Config.cmake.in │ ├── Caffe2ConfigVersion.cmake.in │ ├── Dependencies.cmake │ ├── External/ │ │ ├── nccl.cmake │ │ └── nnpack.cmake │ ├── MiscCheck.cmake │ ├── Modules/ │ │ ├── FindAtlas.cmake │ │ ├── FindBenchmark.cmake │ │ ├── FindCUB.cmake │ │ ├── FindFFmpeg.cmake │ │ ├── FindGloo.cmake │ │ ├── FindHiredis.cmake │ │ ├── FindLAPACK.cmake │ │ ├── FindLMDB.cmake │ │ ├── FindLevelDB.cmake │ │ ├── FindMKL.cmake │ │ ├── FindMatlabMex.cmake │ │ ├── FindNCCL.cmake │ │ ├── FindNNPACK.cmake │ │ ├── FindNumPy.cmake │ │ ├── FindNuma.cmake │ │ ├── FindOpenBLAS.cmake │ │ ├── FindRocksDB.cmake │ │ ├── FindSnappy.cmake │ │ ├── FindZMQ.cmake │ │ ├── Findpybind11.cmake │ │ └── FindvecLib.cmake │ ├── Modules_CUDA_fix/ │ │ ├── FindCUDA/ │ │ │ ├── make2cmake.cmake │ │ │ ├── parse_cubin.cmake │ │ │ ├── run_nvcc.cmake │ │ │ └── select_compute_arch.cmake │ │ ├── FindCUDA.cmake │ │ ├── FindPackageHandleStandardArgs.cmake │ │ ├── FindPackageMessage.cmake │ │ └── README.txt │ ├── ProtoBuf.cmake │ ├── Summary.cmake │ ├── Utils.cmake │ ├── Whitelist.cmake │ ├── cmake_uninstall.cmake.in │ └── public/ │ ├── cuda.cmake │ ├── gflags.cmake │ ├── glog.cmake │ ├── protobuf.cmake │ ├── threads.cmake │ └── utils.cmake ├── conda/ │ ├── cuda/ │ │ ├── build.sh │ │ ├── conda_build_config.yaml │ │ └── meta.yaml │ ├── cuda_full/ │ │ ├── build.sh │ │ ├── conda_build_config.yaml │ │ └── meta.yaml │ └── no_cuda/ │ ├── build.sh │ ├── conda_build_config.yaml │ └── meta.yaml ├── docker/ │ ├── jenkins/ │ │ ├── README.md │ │ ├── build.sh │ │ ├── centos/ │ │ │ ├── .gitignore │ │ │ └── Dockerfile │ │ ├── centos-cuda/ │ │ │ ├── .gitignore │ │ │ └── Dockerfile │ │ ├── common/ │ │ │ ├── add_jenkins_user.sh │ │ │ ├── install_anaconda.sh │ │ │ ├── install_android.sh │ │ │ ├── install_base.sh │ │ │ ├── install_ccache.sh │ │ │ ├── install_clang.sh │ │ │ ├── install_cmake.sh │ │ │ ├── install_cuda.sh │ │ │ ├── install_gcc.sh │ │ │ ├── install_mkl.sh │ │ │ ├── install_nccl.sh │ │ │ └── install_python.sh │ │ ├── ubuntu/ │ │ │ ├── .gitignore │ │ │ └── Dockerfile │ │ └── ubuntu-cuda/ │ │ ├── .gitignore │ │ └── Dockerfile │ ├── readme.md │ ├── ubuntu-14.04-cpu-all-options/ │ │ └── Dockerfile │ ├── ubuntu-14.04-cpu-minimal/ │ │ └── Dockerfile │ ├── ubuntu-16.04-cpu-all-options/ │ │ └── Dockerfile │ ├── ubuntu-16.04-cpu-minimal/ │ │ └── Dockerfile │ ├── ubuntu-16.04-cuda8-cudnn6-all-options/ │ │ └── Dockerfile │ ├── ubuntu-16.04-cuda8-cudnn7-all-options/ │ │ └── Dockerfile │ └── ubuntu-16.04-gpu-tutorial/ │ └── Dockerfile ├── docs/ │ ├── .Doxyfile-c │ ├── .Doxyfile-python │ ├── DOXYGEN.md │ ├── DoxygenLayout-c.xml │ ├── DoxygenLayout-python.xml │ ├── README.md │ ├── footer.html │ ├── header.html │ ├── installation.md │ ├── main.css │ ├── process.py │ └── stylesheet.css ├── modules/ │ ├── CMakeLists.txt │ ├── detectron/ │ │ ├── CMakeLists.txt │ │ ├── affine_channel_op.cc │ │ ├── affine_channel_op.cu │ │ ├── affine_channel_op.h │ │ ├── batch_permutation_op.cc │ │ ├── batch_permutation_op.cu │ │ ├── batch_permutation_op.h │ │ ├── group_spatial_softmax_op.cc │ │ ├── group_spatial_softmax_op.cu │ │ ├── group_spatial_softmax_op.h │ │ ├── ps_roi_pool_op.cc │ │ ├── ps_roi_pool_op.cu │ │ ├── ps_roi_pool_op.h │ │ ├── roi_pool_f_op.cc │ │ ├── roi_pool_f_op.cu │ │ ├── roi_pool_f_op.h │ │ ├── sample_as_op.cc │ │ ├── sample_as_op.cu │ │ ├── sample_as_op.h │ │ ├── select_smooth_l1_loss_op.cc │ │ ├── select_smooth_l1_loss_op.cu │ │ ├── select_smooth_l1_loss_op.h │ │ ├── 
sigmoid_cross_entropy_loss_op.cc │ │ ├── sigmoid_cross_entropy_loss_op.cu │ │ ├── sigmoid_cross_entropy_loss_op.h │ │ ├── sigmoid_focal_loss_op.cc │ │ ├── sigmoid_focal_loss_op.cu │ │ ├── sigmoid_focal_loss_op.h │ │ ├── smooth_l1_loss_op.cc │ │ ├── smooth_l1_loss_op.cu │ │ ├── smooth_l1_loss_op.h │ │ ├── softmax_focal_loss_op.cc │ │ ├── softmax_focal_loss_op.cu │ │ ├── softmax_focal_loss_op.h │ │ ├── spatial_narrow_as_op.cc │ │ ├── spatial_narrow_as_op.cu │ │ ├── spatial_narrow_as_op.h │ │ ├── upsample_nearest_op.cc │ │ ├── upsample_nearest_op.cu │ │ └── upsample_nearest_op.h │ ├── module_test/ │ │ ├── CMakeLists.txt │ │ └── module_test_dynamic.cc │ ├── observers/ │ │ ├── CMakeLists.txt │ │ ├── net_observer_reporter.h │ │ ├── net_observer_reporter_print.cc │ │ ├── net_observer_reporter_print.h │ │ ├── observer_config.cc │ │ ├── observer_config.h │ │ ├── perf_observer.cc │ │ └── perf_observer.h │ └── rocksdb/ │ ├── CMakeLists.txt │ └── rocksdb.cc ├── release-notes.md ├── scripts/ │ ├── add_apache_header.sh │ ├── apache_header.txt │ ├── appveyor/ │ │ ├── install.bat │ │ └── install_cuda.bat │ ├── diagnose_protobuf.py │ ├── get_python_cmake_flags.py │ ├── read_conda_versions.sh │ ├── start_ipython_notebook.sh │ └── temp.sh └── setup.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
*.ipynb linguist-documentation

================================================
FILE: .github/CONTRIBUTING.md
================================================

================================================
FILE: .github/ISSUE_TEMPLATE.md
================================================
If this is a build issue, please fill out the template below.

### System information

* Operating system:
* Compiler version:
* CMake version:
* CMake arguments:
* Relevant libraries/versions (e.g. CUDA):

### CMake summary output

```
******** Summary ********
```

================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
================================================
FILE: .gitignore
================================================
## General

# Compiled Object files
*.slo
*.lo
*.o
*.cuo
*.obj

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Compiled protocol buffers
*.pb.h
*.pb.cc
*_pb2.py

# Compiled python
*.pyc
*.pyd

# Compiled MATLAB
*.mex*

# IPython notebook checkpoints
.ipynb_checkpoints

# Editor temporaries
*.swn
*.swo
*.swp
*~

# Sublime Text settings
*.sublime-workspace
*.sublime-project

# Eclipse Project settings
*.*project
.settings

# QtCreator files
*.user

# PyCharm files
.idea

# Visual Studio Code files
.vscode
.vs

# OSX dir files
.DS_Store

## Caffe2

# build, distribute, and bins (+ python proto bindings)
build
build_host_protoc
build_android
build_ios
build_*
.build_debug/*
.build_release/*
distribute/*
*.testbin
*.bin
cmake_build
.cmake_build
gen
.setuptools-cmake-build
.pytest_cache

# Bram
plsdontbreak

# Generated documentation
docs/_site
docs/gathered
_site
doxygen
docs/dev

# LevelDB files
*.sst
*.ldb
LOCK
LOG*
CURRENT
MANIFEST-*

# generated version file
caffe2/version.py

# setup.py intermediates
.eggs
caffe2.egg-info

# Atom/Watchman required file
.watchmanconfig

================================================
FILE: .gitmodules
================================================
[submodule "third_party/pybind11"]
    path = third_party/pybind11
    url = https://github.com/pybind/pybind11.git
[submodule "third_party/nccl"]
    path = third_party/nccl
    url = https://github.com/nvidia/nccl.git
[submodule "third_party/cub"]
    path = third_party/cub
    url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
    path = third_party/eigen
    url = https://github.com/RLovelett/eigen.git
[submodule "third_party/googletest"]
    path = third_party/googletest
    url = https://github.com/google/googletest.git
[submodule "third_party/nervanagpu"]
    path = third_party/nervanagpu
    url = https://github.com/NervanaSystems/nervanagpu.git
[submodule "third_party/benchmark"]
    path = third_party/benchmark
    url = https://github.com/google/benchmark.git
[submodule "third_party/protobuf"]
    path = third_party/protobuf
    url = https://github.com/google/protobuf.git
[submodule "third_party/ios-cmake"]
    path = third_party/ios-cmake
    url = https://github.com/Yangqing/ios-cmake.git
[submodule "third_party/NNPACK"]
    path = third_party/NNPACK
    url = https://github.com/Maratyszcza/NNPACK.git
[submodule "third_party/gloo"]
    path = third_party/gloo
    url = https://github.com/facebookincubator/gloo
[submodule "third_party/NNPACK_deps/pthreadpool"]
    path = third_party/pthreadpool
    url = https://github.com/Maratyszcza/pthreadpool.git
[submodule "third_party/NNPACK_deps/FXdiv"]
    path = third_party/FXdiv
    url = https://github.com/Maratyszcza/FXdiv.git
[submodule "third_party/NNPACK_deps/FP16"]
    path = third_party/FP16
    url = https://github.com/Maratyszcza/FP16.git
[submodule "third_party/NNPACK_deps/psimd"]
    path = third_party/psimd
    url = https://github.com/Maratyszcza/psimd.git
[submodule "third_party/aten"]
    path = third_party/aten
    url = https://github.com/zdevito/aten
[submodule "third_party/zstd"]
    path = third_party/zstd
    url = https://github.com/facebook/zstd.git
[submodule "third-party/cpuinfo"]
    path = third_party/cpuinfo
    url = https://github.com/Maratyszcza/cpuinfo.git
[submodule "third_party/python-enum"]
    path = third_party/python-enum
    url = https://github.com/PeachPy/enum34.git
[submodule "third_party/python-peachpy"]
    path = third_party/python-peachpy
    url = https://github.com/Maratyszcza/PeachPy.git
[submodule "third_party/python-six"]
    path = third_party/python-six
    url = https://github.com/benjaminp/six.git
[submodule "third_party/ComputeLibrary"]
    path = third_party/ComputeLibrary
    url = https://github.com/ARM-software/ComputeLibrary.git
[submodule "third_party/onnx"]
    path = third_party/onnx
    url = https://github.com/onnx/onnx.git
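All of the dependencies above are pinned as git submodules, so a fresh checkout needs them fetched before building. A minimal sketch using standard git commands; the clone URL is assumed from the `Repository:` field at the top of this extract, and any fork works the same way:

```bash
# Clone together with all pinned submodules in one step...
git clone --recursive https://github.com/Yangqing/caffe2.git

# ...or, inside an existing checkout, fetch the submodules
# listed in .gitmodules at their pinned commits.
git submodule update --init --recursive
```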
third_party/python-enum url = https://github.com/PeachPy/enum34.git [submodule "third_party/python-peachpy"] path = third_party/python-peachpy url = https://github.com/Maratyszcza/PeachPy.git [submodule "third_party/python-six"] path = third_party/python-six url = https://github.com/benjaminp/six.git [submodule "third_party/ComputeLibrary"] path = third_party/ComputeLibrary url = https://github.com/ARM-software/ComputeLibrary.git [submodule "third_party/onnx"] path = third_party/onnx url = https://github.com/onnx/onnx.git ================================================ FILE: .jenkins/README.md ================================================ # Jenkins The scripts in this directory are the entrypoint for testing Caffe2. The environment variable `BUILD_ENVIRONMENT` is expected to be set to the build environment you intend to test. It is a hint for the build and test scripts to configure Caffe2 a certain way and include/exclude tests. For Docker builds, the value equals the name of the Docker image itself. For example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are built on Jenkins and are used in triggered builds already have this environment variable set in their manifest. Also see `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. ================================================ FILE: .jenkins/build.sh ================================================ #!/bin/bash set -ex LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/.. && pwd) # Setup sccache if SCCACHE_BUCKET is set if [ -n "${SCCACHE_BUCKET}" ]; then mkdir -p ./sccache SCCACHE="$(which sccache)" if [ -z "${SCCACHE}" ]; then echo "Unable to find sccache..." exit 1 fi # Setup wrapper scripts for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do ( echo "#!/bin/sh" echo "exec $SCCACHE $(which $compiler) \"\$@\"" ) > "./sccache/$compiler" chmod +x "./sccache/$compiler" done # CMake must find these wrapper scripts export PATH="$PWD/sccache:$PATH" fi # Setup ccache if configured to use it (and not sccache) if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then mkdir -p ./ccache ln -sf "$(which ccache)" ./ccache/cc ln -sf "$(which ccache)" ./ccache/c++ ln -sf "$(which ccache)" ./ccache/gcc ln -sf "$(which ccache)" ./ccache/g++ ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc export CCACHE_WRAPPER_DIR="$PWD/ccache" export PATH="$CCACHE_WRAPPER_DIR:$PATH" fi CMAKE_ARGS=("-DBUILD_BINARY=ON") CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") CMAKE_ARGS+=("-DUSE_ZSTD=ON") # Run build script from scripts if applicable if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then export ANDROID_NDK=/opt/ndk "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" exit 0 fi if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then # click (required by onnx) wants these set export LANG=C.UTF-8 export LC_ALL=C.UTF-8 # SKIP_CONDA_TESTS refers to only the 'test' section of the meta.yaml export SKIP_CONDA_TESTS=1 export CONDA_INSTALL_LOCALLY=1 "${ROOT_DIR}/scripts/build_anaconda.sh" "$@" # The tests all need hypothesis, tabulate, and pydot, which aren't included # in the conda packages conda install -y hypothesis tabulate pydot # This build will be tested against onnx tests, which need onnx installed. # Onnx should be built against the same protobuf that Caffe2 uses, which is # only installed in the conda environment when Caffe2 is.
# This path comes from install_anaconda.sh which installs Anaconda into the # docker image PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx" exit 0 fi # Run cmake from ./build directory mkdir -p ./build cd ./build INSTALL_PREFIX="/usr/local/caffe2" CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") # Explicitly set Python executable. # On Ubuntu 16.04 the default Python is still 2.7. PYTHON="$(which python)" if [[ "${BUILD_ENVIRONMENT}" == py3* ]]; then PYTHON=/usr/bin/python3 CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") fi case "${BUILD_ENVIRONMENT}" in *-mkl*) CMAKE_ARGS+=("-DBLAS=MKL") ;; *-cuda*) CMAKE_ARGS+=("-DUSE_CUDA=ON") CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") CMAKE_ARGS+=("-DUSE_NNPACK=OFF") # Add ccache symlink for nvcc ln -sf "$(which ccache)" "${CCACHE_WRAPPER_DIR}/nvcc" # Explicitly set path to NVCC such that the symlink to ccache is used CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CCACHE_WRAPPER_DIR}/nvcc") # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. # Setting PATH to resolve to the right nvcc alone isn't enough. # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. export CUDA_PATH="/usr/local/cuda" # Ensure the ccache symlink can still find the real nvcc binary. export PATH="/usr/local/cuda/bin:$PATH" ;; esac # Try to include Redis support for Linux builds if [ "$(uname)" == "Linux" ]; then CMAKE_ARGS+=("-DUSE_REDIS=ON") fi # Currently, on Jenkins macOS, we will use custom protobuf. The macOS # contbuild at the moment is minimal dependency - it doesn't use glog # or gflags either. if [ "$(uname)" == "Darwin" ]; then CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") fi # We test for the presence of cmake3 (for platforms like CentOS and Ubuntu 14.04) # and use it if available. if [[ -x "$(command -v cmake3)" ]]; then CMAKE_BINARY=cmake3 else CMAKE_BINARY=cmake fi # Configure ${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" # Build if [ "$(uname)" == "Linux" ]; then make "-j$(nproc)" install else echo "Don't know how to build on $(uname)" exit 1 fi # Install ONNX into a local directory ONNX_INSTALL_PATH="/usr/local/onnx" pip install "${ROOT_DIR}/third_party/onnx" -t "${ONNX_INSTALL_PATH}" # Symlink the caffe2 base python path into the system python path, # so that we can import caffe2 without having to change $PYTHONPATH. # Run in a subshell to contain environment set by /etc/os-release. # # This is only done when running on Jenkins! We don't want to pollute # the user environment with Python symlinks and ld.so.conf.d hacks.
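# (Illustrative check, not part of the original script.) Once the symlinks
# below are in place, caffe2 should import without any PYTHONPATH tweaks:
#   python -c 'from caffe2.python import core; print(core.__file__)'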
# if [ -n "${JENKINS_URL}" ]; then ( source /etc/os-release function python_version() { "$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])' } # Debian/Ubuntu if [[ "$ID_LIKE" == *debian* ]]; then python_path="/usr/local/lib/$(python_version)/dist-packages" sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}" fi # RHEL/CentOS if [[ "$ID_LIKE" == *rhel* ]]; then python_path="/usr/lib64/$(python_version)/site-packages/" sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}" fi # /etc/ld.so.conf.d is used on both Debian and RHEL echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf sudo ldconfig ) fi ================================================ FILE: .jenkins/test.sh ================================================ #!/bin/bash set -ex # Figure out which Python to use PYTHON="python" if [ -n "$BUILD_ENVIRONMENT" ]; then if [[ "$BUILD_ENVIRONMENT" == py2* ]]; then PYTHON="python2" elif [[ "$BUILD_ENVIRONMENT" == py3* ]]; then PYTHON="python3" fi fi # The prefix must mirror the setting from build.sh INSTALL_PREFIX="/usr/local/caffe2" # Anaconda builds have a special install prefix and python if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then # This path comes from install_anaconda.sh which installs Anaconda into the # docker image PYTHON="/opt/conda/bin/python" INSTALL_PREFIX="/opt/conda/" fi # Add the site-packages in the caffe2 install prefix to the PYTHONPATH SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))") INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}" LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/.. && pwd) # Skip tests in environments where they are not built/applicable if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then echo 'Skipping tests' exit 0 fi # Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed # Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this. if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR" export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib" fi exit_code=0 cd "$ROOT_DIR"/caffe2/python/tutorials python tutorials_to_script_converter.py git status if git diff --quiet HEAD; then echo "Source tree is clean." else echo "After running a tutorial -> script sync there are changes. This probably means you edited an ipython notebook without a proper sync to a script. Please see caffe2/python/tutorials/README.md for more information" if [ "$exit_code" -eq 0 ]; then exit_code=1 fi fi cd "$ROOT_DIR" if [ -d ./test ]; then echo "Directory ./test already exists; please remove it..." exit 1 fi mkdir -p ./test/{cpp,python} TEST_DIR="$PWD/test" cd ${INSTALL_PREFIX} # Commands below may exit with non-zero status set +e # C++ tests echo "Running C++ tests.." for test in ./test/*; do # Skip tests we know are hanging or bad case "$(basename "$test")" in mkl_utils_test) continue ;; # TODO investigate conv_op_test failures when using MKL conv_op_test) continue ;; esac "$test" --gtest_output=xml:"$TEST_DIR"/cpp/$(basename "$test").xml tmp_exit_code="$?" 
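# Keep only the first failure: exit_code is overwritten with a test's status
# only while it is still zero, so a later success (or failure) cannot mask
# the first non-zero status, which is what the script finally exits with.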
if [ "$exit_code" -eq 0 ]; then exit_code="$tmp_exit_code" fi done # Get the relative path to where the caffe2 python module was installed CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2" # Collect additional tests to run (outside caffe2/python) EXTRA_TESTS=() # CUDA builds always include NCCL support if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl") fi # Python tests echo "Running Python tests.." "$PYTHON" \ -m pytest \ -x \ -v \ --junit-xml="$TEST_DIR/python/result.xml" \ --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \ --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \ --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \ --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \ "$CAFFE2_PYPATH/python" \ "${EXTRA_TESTS[@]}" tmp_exit_code="$?" if [ "$exit_code" -eq 0 ]; then exit_code="$tmp_exit_code" fi # Exit with the first non-zero status we got exit "$exit_code" ================================================ FILE: .travis/build.sh ================================================ #!/bin/bash set -e set -x LOCAL_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) ROOT_DIR=$(dirname "$LOCAL_DIR") cd "$ROOT_DIR" mkdir build cd build # Special cases - run script and exit if [ "$BUILD_ANDROID" = 'true' ]; then export ANDROID_NDK=/opt/android_ndk "${ROOT_DIR}/scripts/build_android.sh" exit 0 fi if [ "$BUILD_IOS" = 'true' ]; then "${ROOT_DIR}/scripts/build_ios.sh" -DCMAKE_OSX_ARCHITECTURES=arm64 exit 0 fi # Configure CMAKE_ARGS=('-DCMAKE_VERBOSE_MAKEFILE=ON') CMAKE_ARGS+=('-DCMAKE_INSTALL_PREFIX=../install') if [ "$BUILD_CUDA" = 'true' ]; then CMAKE_ARGS+=('-DUSE_CUDA=ON') CMAKE_ARGS+=('-DCUDA_ARCH_NAME=Pascal') CMAKE_ARGS+=('-DCUDA_NVCC_EXECUTABLE=/usr/local/bin/nvcc') export PATH="/usr/local/cuda/bin:${PATH}" CMAKE_ARGS+=('-DUSE_NNPACK=OFF') else CMAKE_ARGS+=('-DUSE_CUDA=OFF') fi if [ "$BUILD_MKL" = 'true' ]; then CMAKE_ARGS+=('-DBLAS=MKL') fi if [ "$BUILD_TESTS" = 'false' ]; then CMAKE_ARGS+=('-DBUILD_TEST=OFF') fi CMAKE_ARGS+=$(python $ROOT_DIR/scripts/get_python_cmake_flags.py) cmake .. 
${CMAKE_ARGS[*]} # Build if [ "$TRAVIS_OS_NAME" = 'linux' ]; then make "-j$(nproc)" install elif [ "$TRAVIS_OS_NAME" = 'osx' ]; then make "-j$(sysctl -n hw.ncpu)" install fi ================================================ FILE: .travis/install.sh ================================================ #!/bin/bash set -e set -x LOCAL_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) ROOT_DIR=$(dirname "$LOCAL_DIR") cd "$ROOT_DIR" APT_INSTALL_CMD='sudo apt-get install -y --no-install-recommends' if [ "$TRAVIS_OS_NAME" = 'linux' ]; then #################### # apt dependencies # #################### sudo apt-get update $APT_INSTALL_CMD \ asciidoc \ autoconf \ automake \ build-essential \ ca-certificates \ ccache \ docbook-xml \ docbook-xsl \ git \ gperf \ libatlas-base-dev \ libgoogle-glog-dev \ libiomp-dev \ libleveldb-dev \ liblmdb-dev \ libopencv-dev \ libprotobuf-dev \ libpthread-stubs0-dev \ libsnappy-dev \ protobuf-compiler \ python \ python-dev \ python-pip \ python-wheel \ software-properties-common \ xsltproc # Install ccache symlink wrappers pushd /usr/local/bin sudo ln -sf "$(which ccache)" gcc sudo ln -sf "$(which ccache)" g++ popd if [ "$BUILD_GCC5" = 'true' ]; then ################ # Install GCC5 # ################ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test sudo apt-get update $APT_INSTALL_CMD g++-5 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 60 \ --slave /usr/bin/g++ g++ /usr/bin/g++-5 fi if [ "$BUILD_CUDA" = 'true' ]; then ################## # Install ccache # ################## # Needs specific branch to work with nvcc (ccache/ccache#145) if [ -e "${BUILD_CCACHE_DIR}/ccache" ]; then echo "Using cached ccache build at \"$BUILD_CCACHE_DIR\" ..." else git clone https://github.com/colesbury/ccache -b ccbin "$BUILD_CCACHE_DIR" pushd "$BUILD_CCACHE_DIR" ./autogen.sh ./configure make "-j$(nproc)" popd fi # Overwrite ccache symlink wrappers pushd /usr/local/bin sudo ln -sf "${BUILD_CCACHE_DIR}/ccache" gcc sudo ln -sf "${BUILD_CCACHE_DIR}/ccache" g++ sudo ln -sf "${BUILD_CCACHE_DIR}/ccache" nvcc popd ################# # Install CMake # ################# # Newer version required to get cmake+ccache+nvcc to work _cmake_installer=/tmp/cmake.sh wget -O "$_cmake_installer" https://cmake.org/files/v3.8/cmake-3.8.2-Linux-x86_64.sh sudo bash "$_cmake_installer" --prefix=/usr/local --skip-license rm -rf "$_cmake_installer" ################ # Install CUDA # ################ CUDA_REPO_PKG='cuda-repo-ubuntu1404_8.0.44-1_amd64.deb' CUDA_PKG_VERSION='8-0' CUDA_VERSION='8.0' wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/${CUDA_REPO_PKG}" sudo dpkg -i "$CUDA_REPO_PKG" rm -f "$CUDA_REPO_PKG" sudo apt-get update $APT_INSTALL_CMD \ "cuda-core-${CUDA_PKG_VERSION}" \ "cuda-cublas-dev-${CUDA_PKG_VERSION}" \ "cuda-cudart-dev-${CUDA_PKG_VERSION}" \ "cuda-curand-dev-${CUDA_PKG_VERSION}" \ "cuda-driver-dev-${CUDA_PKG_VERSION}" \ "cuda-nvrtc-dev-${CUDA_PKG_VERSION}" # Manually create CUDA symlink sudo ln -sf /usr/local/cuda-$CUDA_VERSION /usr/local/cuda ################# # Install cuDNN # ################# CUDNN_REPO_PKG='nvidia-machine-learning-repo-ubuntu1404_4.0-2_amd64.deb' CUDNN_PKG_VERSION='6.0.20-1+cuda8.0' wget "https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/${CUDNN_REPO_PKG}" sudo dpkg -i "$CUDNN_REPO_PKG" rm -f "$CUDNN_REPO_PKG" sudo apt-get update $APT_INSTALL_CMD \ "libcudnn6=${CUDNN_PKG_VERSION}" \ "libcudnn6-dev=${CUDNN_PKG_VERSION}" fi if [ "$BUILD_MKL" = 'true' ]; then ############### # 
Install MKL # ############### _mkl_key=/tmp/mkl.pub wget -O "$_mkl_key" http://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB sudo apt-key add "$_mkl_key" rm -f "$_mkl_key" echo 'deb http://apt.repos.intel.com/mkl all main' | sudo tee /etc/apt/sources.list.d/intel-mkl.list sudo apt-get update $APT_INSTALL_CMD intel-mkl-64bit fi elif [ "$TRAVIS_OS_NAME" = 'osx' ]; then ##################### # brew dependencies # ##################### brew update brew install python pip uninstall -y numpy # use brew version (opencv dependency) brew tap homebrew/science # for OpenCV brew install \ ccache \ glog \ leveldb \ lmdb \ protobuf # Install ccache symlink wrappers pushd /usr/local/bin sudo ln -sf "$(which ccache)" clang sudo ln -sf "$(which ccache)" clang++ popd else echo "OS \"$TRAVIS_OS_NAME\" is unknown" exit 1 fi #################### # pip dependencies # #################### sudo pip install \ future \ hypothesis \ numpy \ protobuf \ pytest \ scikit-image if [ "$BUILD_ANDROID" = 'true' ]; then ####################### # Install Android NDK # ####################### _ndk_zip=/tmp/ndk.zip if [ "$TRAVIS_OS_NAME" = 'linux' ]; then $APT_INSTALL_CMD autotools-dev autoconf wget -O "$_ndk_zip" https://dl.google.com/android/repository/android-ndk-r13b-linux-x86_64.zip elif [ "$TRAVIS_OS_NAME" = 'osx' ]; then brew install libtool wget -O "$_ndk_zip" https://dl.google.com/android/repository/android-ndk-r13b-darwin-x86_64.zip else echo "OS \"$TRAVIS_OS_NAME\" is unknown" exit 1 fi _ndk_dir=/opt/android_ndk sudo mkdir -p "$_ndk_dir" sudo chmod a+rwx "$_ndk_dir" unzip -qo "$_ndk_zip" -d "$_ndk_dir" rm -f "$_ndk_zip" _versioned_dir=$(find $_ndk_dir/ -mindepth 1 -maxdepth 1 -type d) mv "$_versioned_dir"/* "$_ndk_dir"/ rmdir "$_versioned_dir" fi if [ "$BUILD_NNPACK" = 'true' ]; then ################# # Install ninja # ################# if [ "$TRAVIS_OS_NAME" = 'linux' ]; then # NNPACK needs a recent version if [ -e "${BUILD_NINJA_DIR}/ninja" ]; then echo "Using cached ninja build at \"$BUILD_NINJA_DIR\" ..." 
else git clone https://github.com/ninja-build/ninja.git -b release "$BUILD_NINJA_DIR" pushd "$BUILD_NINJA_DIR" python configure.py --bootstrap popd fi sudo install -m 755 "${BUILD_NINJA_DIR}/ninja" /usr/local/bin/ninja elif [ "$TRAVIS_OS_NAME" = 'osx' ]; then brew install ninja else echo "OS \"$TRAVIS_OS_NAME\" is unknown" exit 1 fi sudo pip install git+https://github.com/Maratyszcza/PeachPy sudo pip install git+https://github.com/Maratyszcza/confu fi ================================================ FILE: .travis/setup.sh ================================================ #!/bin/bash # This script should be sourced, not executed set -e export BUILD_ANDROID=false export BUILD_CUDA=false export BUILD_GCC5=false export BUILD_IOS=false export BUILD_MKL=false export BUILD_NNPACK=true export BUILD_TESTS=true if [ "$BUILD" = 'linux' ]; then : elif [ "$BUILD" = 'linux-gcc5' ]; then export BUILD_GCC5=true elif [ "$BUILD" = 'linux-cuda' ]; then export BUILD_CUDA=true export BUILD_NNPACK=false export BUILD_TESTS=false elif [ "$BUILD" = 'linux-mkl' ]; then export BUILD_MKL=true export BUILD_TESTS=false elif [ "$BUILD" = 'linux-android' ]; then export BUILD_ANDROID=true export BUILD_TESTS=false elif [ "$BUILD" = 'osx' ]; then # TODO(lukeyeager): enable after caffe2/caffe2#785 export BUILD_TESTS=false # Since Python 2.7.14, HomeBrew does not link python and pip in /usr/local/bin/, # but they are available in /usr/local/opt/python/libexec/bin/ export PATH="/usr/local/opt/python/libexec/bin:${PATH}" elif [ "$BUILD" = 'osx-ios' ]; then export BUILD_IOS=true export BUILD_TESTS=false # Since Python 2.7.14, HomeBrew does not link python and pip in /usr/local/bin/, # but they are available in /usr/local/opt/python/libexec/bin/ export PATH="/usr/local/opt/python/libexec/bin:${PATH}" elif [ "$BUILD" = 'osx-android' ]; then export BUILD_ANDROID=true export BUILD_TESTS=false # Since Python 2.7.14, HomeBrew does not link python and pip in /usr/local/bin/, # but they are available in /usr/local/opt/python/libexec/bin/ export PATH="/usr/local/opt/python/libexec/bin:${PATH}" else echo "BUILD \"$BUILD\" is unknown" exit 1 fi ================================================ FILE: .travis/test.sh ================================================ #!/bin/bash set -e set -x LOCAL_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) ROOT_DIR=$(dirname "$LOCAL_DIR") cd "$ROOT_DIR" if [ "$BUILD_TESTS" = 'false' ]; then echo 'Skipping tests' exit 0 fi # Ctests pushd build CTEST_OUTPUT_ON_FAILURE=1 make test popd # Python tests export PYTHONPATH="${PYTHONPATH}:${ROOT_DIR}/install" export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${ROOT_DIR}/install/lib" python -m pytest -v install/caffe2/python ================================================ FILE: .travis.yml ================================================ os: linux dist: trusty sudo: required language: cpp compiler: gcc env: global: - BUILD_CCACHE_DIR=~/build/ccache - BUILD_NINJA_DIR=~/build/ninja matrix: - BUILD=linux - BUILD=linux-gcc5 - BUILD=linux-cuda - BUILD=linux-mkl - BUILD=linux-android matrix: include: - env: BUILD=osx os: osx osx_image: xcode8.3 compiler: clang - env: BUILD=osx-ios os: osx osx_image: xcode8.3 compiler: clang - env: BUILD=osx-android os: osx osx_image: xcode8.3 compiler: clang cache: directories: - $BUILD_CCACHE_DIR - $BUILD_NINJA_DIR - $HOME/.ccache before_install: - source .travis/setup.sh install: - ./.travis/install.sh - ./.travis/build.sh script: - ./.travis/test.sh ================================================ FILE: CMakeLists.txt 
================================================ cmake_minimum_required(VERSION 3.2 FATAL_ERROR) #cmake_policy(SET CMP0022 NEW) #cmake_policy(SET CMP0023 NEW) # ---[ Project and semantic versioning. project(Caffe2 CXX C) set(CAFFE2_VERSION_MAJOR 0) set(CAFFE2_VERSION_MINOR 8) set(CAFFE2_VERSION_PATCH 2) set(CAFFE2_VERSION "${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}") # One variable that determines whether the current cmake process is being run # with the main Caffe2 library. This is useful for building modules - if # modules are built with the main Caffe2 library then one does not need to do # find caffe2 in the cmake script. One can usually guard it in some way like # if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) # find_package(Caffe2 REQUIRED) # endif() set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON) # ---[ Options. # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. include(CMakeDependentOption) option(BUILD_BINARY "Build C++ binaries" ON) option(BUILD_DOCS "Build documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "If set, build Caffe2's own protobuf under third_party" OFF) option(BUILD_PYTHON "Build Python binaries" ON) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON "NOT BUILD_SHARED_LIBS" OFF) option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON) option(USE_ACL "Use ARM Compute Library" OFF) option(USE_ASAN "Use Address Sanitizer" OFF) option(USE_ATEN "Use ATen" OFF) option(USE_CUDA "Use Cuda" ON) option(USE_FFMPEG "Use ffmpeg" OFF) option(USE_GFLAGS "Use GFLAGS" ON) option(USE_GLOG "Use GLOG" ON) option(USE_GLOO "Use Gloo" ON) option(USE_LEVELDB "Use LEVELDB" ON) option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF) option(USE_LMDB "Use LMDB" ON) option(USE_METAL "Use Metal for iOS build" ON) option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON) option(USE_MPI "Use MPI" ON) option(USE_NATIVE_ARCH "Use -march=native" OFF) option(USE_NCCL "Use NCCL" ON) option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) option(USE_NUMA "Use NUMA (only available on Linux)" ON) option(USE_OBSERVERS "Use observers module." OFF) option(USE_OPENCV "Use openCV" ON) option(USE_OPENMP "Use OpenMP for parallel code" OFF) option(USE_PROF "Use profiling" OFF) option(USE_REDIS "Use Redis" OFF) option(USE_ROCKSDB "Use RocksDB" OFF) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) option(USE_THREADS "Use Threads" ON) option(USE_ZMQ "Use ZMQ" OFF) option(USE_ZSTD "Use ZSTD" OFF) # ---[ CMake scripts + modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) if (MSVC AND ${BUILD_SHARED_LIBS}) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() # ---[ CMake build directories set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) enable_testing() # ---[ Misc checks to cope with various compiler modes include(cmake/MiscCheck.cmake) include(cmake/BuildVariables.cmake) # External projects include(ExternalProject) # TODO: merge the following 3 files into cmake/public/utils.cmake. 
include(cmake/Utils.cmake) include(cmake/public/utils.cmake) set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") # Set default build type if(NOT CMAKE_BUILD_TYPE) message(STATUS "Build type not set - defaulting to Release") set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE) endif() # ---[ Dependencies include(cmake/Dependencies.cmake) # ---[ Whitelist file if whitelist is specified include(cmake/Whitelist.cmake) # ---[ Set link flag, handle additional deps for gcc 4.8 and above if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.8.0 AND NOT ANDROID) message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line") list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc) endif() # ---[ Build flags set(CMAKE_C_STANDARD 99) set(CMAKE_CXX_STANDARD 11) if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") # Eigen fails to build with some versions, so convert this to a warning # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") else() foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) if (${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") endif(${flag_var} MATCHES "/MD") else() if(${flag_var} MATCHES "/MT") string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}") endif() endif() set(${flag_var} "${${flag_var}} /MP /bigobj") endforeach(flag_var) endif() if(ANDROID) if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s") else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s") endif() endif() if(NOT APPLE AND UNIX) list(APPEND Caffe2_DEPENDENCY_LIBS dl) endif() # Prefix path to Caffe2 headers. # If a directory containing installed Caffe2 headers was inadvertently # added to the list of include directories, prefixing # PROJECT_SOURCE_DIR means this source tree always takes precedence. include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # Prefix path to generated Caffe2 headers. # These need to take precedence over their empty counterparts located # in PROJECT_SOURCE_DIR. include_directories(BEFORE ${PROJECT_BINARY_DIR}) # ---[ Old caffe protobuf. 
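# The legacy proto is consumed by tools that read original Caffe models;
# e.g. binaries/convert_caffe_image_db.cc includes caffe/proto/caffe.pb.h.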
add_subdirectory(caffe/proto) # ---[ Main build add_subdirectory(caffe2) # Documentation Option if(BUILD_DOCS) # check if Doxygen is installed find_package(Doxygen) if (DOXYGEN_FOUND) message("Generating documentation") set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/.Doxyfile-c) set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/Doxyfile-c) set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/.Doxyfile-python) set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/Doxyfile-python) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs) endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs) configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY) configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY) add_custom_target(doc_doxygen_c ALL COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating C++ API documentation with Doxygen" VERBATIM) add_custom_target(doc_doxygen_python ALL COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating Python API documentation with Doxygen" VERBATIM) else (DOXYGEN_FOUND) message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation") endif (DOXYGEN_FOUND) endif (BUILD_DOCS) # ---[ CMake related files # Uninstall option. if(NOT TARGET caffe2_uninstall) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake IMMEDIATE @ONLY) add_custom_target(caffe2_uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() # ---[ Make configuration files for cmake to allow dependent libraries # easier access to Caffe2. if ((NOT USE_GLOG) OR (NOT USE_GFLAGS) OR BUILD_CUSTOM_PROTOBUF) message(WARNING "Generated cmake files are only fully tested if one builds " "with system glog, gflags, and protobuf. Other settings may " "generate files that are not well tested.") endif() if (USE_CUDA) # TODO: check if we should include other cuda dependency libraries # to the interface as well. endif() # Note(jiayq): when building static libraries, all PRIVATE dependencies # will also become interface libraries, and as a result if there are any # dependency libraries that are not exported, the following install export # script will fail. As a result, we will only provide the targets cmake # files for shared lib installation.
For more info, read: # https://cmake.org/pipermail/cmake/2016-May/063400.html if (BUILD_SHARED_LIBS) configure_file( ${PROJECT_SOURCE_DIR}/cmake/Caffe2ConfigVersion.cmake.in ${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake @ONLY) configure_file( ${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in ${PROJECT_BINARY_DIR}/Caffe2Config.cmake @ONLY) install(FILES ${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake ${PROJECT_BINARY_DIR}/Caffe2Config.cmake DESTINATION share/cmake/Caffe2 COMPONENT dev) install(FILES ${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake ${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake ${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake ${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake ${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake ${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake DESTINATION share/cmake/Caffe2/public COMPONENT dev) install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2 FILE Caffe2Targets.cmake COMPONENT dev) else() message(WARNING "Generated cmake files are only available when building " "shared libs.") endif() # ---[ Modules add_subdirectory(modules) # ---[ Binaries # Binaries will be built after the Caffe2 main libraries and the modules # are built. For the binaries, they will be linked to the Caffe2 main # libraries, as well as all the modules that are built with Caffe2 (the ones # built in the previous Modules section above). if (BUILD_BINARY) add_subdirectory(binaries) endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ # This makefile does nothing but delegating the actual building to cmake. all: @mkdir -p build && cd build && cmake .. $(shell python ./scripts/get_python_cmake_flags.py) && $(MAKE) local: @./scripts/build_local.sh android: @./scripts/build_android.sh ios: @./scripts/build_ios.sh clean: # This will remove ALL build folders. @rm -r build*/ linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. You can install cloc with " && \ echo " sudo apt-get install cloc" ================================================ FILE: NOTICE ================================================ Copyright (c) 2016-present, Facebook Inc. All rights reserved. All contributions by Facebook: Copyright (c) 2016 Facebook Inc. All contributions by Google: Copyright (c) 2015 Google Inc. All rights reserved. All contributions by Yangqing Jia: Copyright (c) 2015 Yangqing Jia All rights reserved. 
All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. All other contributions: Copyright(c) 2015, 2016 the respective contributors All rights reserved. Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. ======================================================================= Software under third_party ======================================================================= Software libraries under third_party are provided as github submodule links, and their content is not part of the Caffe2 codebase. Their licenses can be found under the respective software repositories. ======================================================================= Earlier BSD License ======================================================================= Early development of Caffe2 in 2015 and early 2016 is licensed under the BSD license. The license is attached below: All contributions by Facebook: Copyright (c) 2016 Facebook Inc. All contributions by Google: Copyright (c) 2015 Google Inc. All rights reserved. All contributions by Yangqing Jia: Copyright (c) 2015 Yangqing Jia All rights reserved. All other contributions: Copyright(c) 2015, 2016 the respective contributors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ======================================================================= Caffe's BSD License ======================================================================= Some parts of the caffe2 code are derived from the original Caffe code, which was created by Yangqing Jia and is now a BSD-licensed open-source project. The Caffe license is as follows: COPYRIGHT All contributions by the University of California: Copyright (c) 2014, The Regents of the University of California (Regents) All rights reserved. All other contributions: Copyright (c) 2014, the respective contributors All rights reserved. Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe.
The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. LICENSE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. ================================================ FILE: README.md ================================================ # Caffe2 [![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0) [![Jenkins Build Status](https://ci.pytorch.org/jenkins/job/caffe2-master/badge/icon)](https://ci.pytorch.org/jenkins/job/caffe2-master) [![Appveyor Build Status](https://img.shields.io/appveyor/ci/Yangqing/caffe2.svg)](https://ci.appveyor.com/project/Yangqing/caffe2) Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind. ## Questions and Feedback Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features. Please participate in our survey (https://www.surveymonkey.com/r/caffe2). We will send you information about new releases and special developer events/webinars. ## License Caffe2 is released under the [Apache 2.0 license](https://github.com/caffe2/caffe2/blob/master/LICENSE). See the [NOTICE](https://github.com/caffe2/caffe2/blob/master/NOTICE) file for details. 
### Further Resources on [Caffe2.ai](http://caffe2.ai) * [Installation](http://caffe2.ai/docs/getting-started.html) * [Learn More](http://caffe2.ai/docs/learn-more.html) * [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html) * [Datasets](http://caffe2.ai/docs/datasets.html) * [Model Zoo](http://caffe2.ai/docs/zoo.html) * [Tutorials](http://caffe2.ai/docs/tutorials.html) * [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html) * [C++ API](http://caffe2.ai/doxygen-c/html/classes.html) * [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html) ================================================ FILE: VERSION_NUMBER ================================================ 0.8.1 ================================================ FILE: appveyor.yml ================================================ version: '{build}' clone_folder: c:\projects\caffe2 environment: matrix: - USE_CUDA: OFF CMAKE_BUILD_TYPE: Release APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 # Building CUDA with Visual Studio 2017 is yet to be supported by # NVIDIA, so we cannot enable it right now. #- USE_CUDA: ON # CMAKE_BUILD_TYPE: Release # APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 # Building CUDA currently causes a timeout in appveyor. In the interest # of properly monitoring the rest, we will disable cuda contbuild for now. #- USE_CUDA: ON # CMAKE_BUILD_TYPE: Release # APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - USE_CUDA: OFF CMAKE_BUILD_TYPE: Release APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 # Debug build is not a top priority for us right now, so in the # interest of contbuild time, we disable it. #- USE_CUDA: OFF # CMAKE_BUILD_TYPE: Debug # APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 # Currently, CUDA + Debug does not work due to an error of using # std::_Debug_lt in device code. Not sure where this comes from yet, # but it is probably safe to assume that very few are going to build # debug mode with CUDA and Windows.
#- USE_CUDA: ON # CMAKE_BUILD_TYPE: Debug install: - cmd: c:\projects\caffe2\scripts\appveyor\install.bat build_script: - cmd: >- cd c:\projects\caffe2 git submodule update --init call scripts\build_windows.bat ================================================ FILE: binaries/CMakeLists.txt ================================================ caffe2_binary_target("convert_caffe_image_db.cc") caffe2_binary_target("convert_db.cc") caffe2_binary_target("make_cifar_db.cc") caffe2_binary_target("make_mnist_db.cc") caffe2_binary_target("predictor_verifier.cc") caffe2_binary_target("print_registered_core_operators.cc") caffe2_binary_target("run_plan.cc") caffe2_binary_target("speed_benchmark.cc") caffe2_binary_target("split_db.cc") caffe2_binary_target("db_throughput.cc") if (USE_CUDA) caffe2_binary_target("inspect_gpus.cc") target_link_libraries(inspect_gpus ${CUDA_LIBRARIES}) caffe2_binary_target("print_core_object_sizes.cc") if (BUILD_TEST) # Core overhead benchmark caffe2_binary_target("core_overhead_benchmark.cc") target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY}) endif() endif() if (USE_ZMQ) caffe2_binary_target("zmq_feeder.cc") target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES}) endif() if(USE_MPI) caffe2_binary_target("run_plan_mpi.cc") target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES}) endif() if (USE_OPENCV AND USE_LEVELDB) caffe2_binary_target("convert_encoded_to_raw_leveldb.cc") target_link_libraries( convert_encoded_to_raw_leveldb ${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES}) endif() if (USE_OPENCV) caffe2_binary_target("make_image_db.cc") target_link_libraries(make_image_db ${OpenCV_LIBS}) endif() if (USE_OBSERVERS) caffe2_binary_target("caffe2_benchmark.cc") endif() # ---[ tutorials caffe2_binary_target("tutorial_blob.cc") ================================================ FILE: binaries/caffe2_benchmark.cc ================================================ #include <fstream> #include <iterator> #include <string> #include "caffe2/core/blob_serialization.h" #include "caffe2/core/init.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/utils/proto_utils.h" #include "caffe2/utils/string_utils.h" #include "observers/observer_config.h" CAFFE2_DEFINE_string( backend, "builtin", "The backend to use when running the model. The allowed " "backend choices are: builtin, default, nnpack, eigen, mkl"); CAFFE2_DEFINE_string( init_net, "", "The given net to initialize any parameters."); CAFFE2_DEFINE_string( input, "", "Input that is needed for running the network. If " "multiple inputs are needed, use a comma separated string."); CAFFE2_DEFINE_string( input_dims, "", "Alternate to input_file, if all inputs are simple " "float TensorCPUs, specify the dimension using comma " "separated numbers. If multiple inputs are needed, use a " "semicolon to separate the dimensions of different " "tensors."); CAFFE2_DEFINE_string( input_file, "", "Input file that contains the serialized protobuf for " "the input blobs. If multiple inputs are needed, use a comma " "separated string. Must have the same number of items " "as input does."); CAFFE2_DEFINE_string( input_type, "float", "Input type when specifying the input dimension. " "The supported types are float, uint8_t."); CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); CAFFE2_DEFINE_string( output, "", "Output that should be dumped after the execution " "finishes. If multiple outputs are needed, use comma " "separated string.
If you want to dump everything, pass " "'*' as the output value."); CAFFE2_DEFINE_string( output_folder, "", "The folder that the output should be written to. This " "folder must already exist in the file system."); CAFFE2_DEFINE_bool( run_individual, false, "Whether to benchmark individual operators."); CAFFE2_DEFINE_bool( text_output, false, "Whether to write out output in text format for regression purposes."); CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); using std::string; using std::unique_ptr; using std::vector; static void writeTextOutput( caffe2::TensorCPU* tensor, const string& output_prefix, const string& name) { string output_name = output_prefix + "/" + name + ".txt"; caffe2::TensorSerializer<caffe2::CPUContext> ser; caffe2::BlobProto blob_proto; ser.Serialize( *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size()); blob_proto.set_name(output_name); blob_proto.set_type("Tensor"); CAFFE_ENFORCE(blob_proto.has_tensor()); caffe2::TensorProto tensor_proto = blob_proto.tensor(); vector<float> data; switch (tensor_proto.data_type()) { case caffe2::TensorProto::FLOAT: { std::copy( tensor_proto.float_data().begin(), tensor_proto.float_data().end(), std::back_inserter(data)); break; } case caffe2::TensorProto::INT32: { std::copy( tensor_proto.int32_data().begin(), tensor_proto.int32_data().end(), std::back_inserter(data)); break; } default: CAFFE_THROW("Unimplemented Blob type."); } std::ofstream output_file(output_name); std::ostream_iterator<float> output_iterator(output_file, "\n"); std::copy(data.begin(), data.end(), output_iterator); } int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); caffe2::ShowLogInfoToStderr(); unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace()); // Run initialization network. caffe2::NetDef init_net_def; CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); // Load input. if (caffe2::FLAGS_input.size()) { vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input); if (caffe2::FLAGS_input_file.size()) { vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file); CAFFE_ENFORCE_EQ( input_names.size(), input_files.size(), "Input name and file should have the same number."); for (int i = 0; i < input_names.size(); ++i) { caffe2::BlobProto blob_proto; CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto)); workspace->CreateBlob(input_names[i])->Deserialize(blob_proto); } } else if (caffe2::FLAGS_input_dims.size()) { vector<string> input_dims_list = caffe2::split(';', caffe2::FLAGS_input_dims); CAFFE_ENFORCE_EQ( input_names.size(), input_dims_list.size(), "Input name and dims should have the same number of items."); for (int i = 0; i < input_names.size(); ++i) { vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]); vector<int> input_dims; for (const string& s : input_dims_str) { input_dims.push_back(caffe2::stoi(s)); } if (!workspace->HasBlob(input_names[i])) { workspace->CreateBlob(input_names[i]); } caffe2::TensorCPU* tensor = workspace->GetBlob(input_names[i])->GetMutable<caffe2::TensorCPU>(); tensor->Resize(input_dims); if (caffe2::FLAGS_input_type == "float") { tensor->mutable_data<float>(); } else { CAFFE_ENFORCE( caffe2::FLAGS_input_type == "uint8_t", "Only supported input types are: float, uint8_t"); tensor->mutable_data<uint8_t>(); } } } else { CAFFE_THROW( "You requested input tensors, but neither input_file nor " "input_dims is set."); } } // Run main network.
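// The --backend flag below is mapped onto an operator "engine": nnpack ->
// NNPACK, eigen -> EIGEN, mkl -> MKLDNN, and default -> "" (clearing any
// engine so each operator's default implementation is used); with the flag's
// default value "builtin" the rewrite is skipped entirely. An illustrative
// invocation (file names are placeholders), with per-tensor dims
// comma-separated and multiple tensors separated by semicolons:
//   caffe2_benchmark --init_net=init_net.pb --net=predict_net.pb \
//     --input=data --input_dims='1,3,224,224' --input_type=float \
//     --warmup=5 --iter=10 --backend=nnpack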
caffe2::NetDef net_def; CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); if (caffe2::FLAGS_backend != "builtin") { std::string engine = caffe2::FLAGS_backend == "nnpack" ? "NNPACK" : caffe2::FLAGS_backend == "eigen" ? "EIGEN" : caffe2::FLAGS_backend == "mkl" ? "MKLDNN" : caffe2::FLAGS_backend == "default" ? "" : "NONE"; CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); for (int i = 0; i < net_def.op_size(); i++) { caffe2::OperatorDef* op_def = net_def.mutable_op(i); op_def->set_engine(engine); } } caffe2::NetBase* net = workspace->CreateNet(net_def); CHECK_NOTNULL(net); LOG(INFO) << "Starting benchmark."; caffe2::ObserverConfig::initSampleRate( 1, 1, 1, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup); LOG(INFO) << "Running warmup runs."; for (int i = 0; i < caffe2::FLAGS_warmup; ++i) { CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); } LOG(INFO) << "Main runs."; CAFFE_ENFORCE( caffe2::FLAGS_iter >= 0, "Number of main runs should be non negative, provided ", caffe2::FLAGS_iter, "."); for (int i = 0; i < caffe2::FLAGS_iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, caffe2::FLAGS_warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); if (caffe2::FLAGS_run_individual) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, caffe2::FLAGS_warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); } } string output_prefix = caffe2::FLAGS_output_folder.size() ? caffe2::FLAGS_output_folder + "/" : ""; if (caffe2::FLAGS_output.size()) { vector output_names = caffe2::split(',', caffe2::FLAGS_output); if (caffe2::FLAGS_output == "*") { output_names = workspace->Blobs(); } for (const string& name : output_names) { CAFFE_ENFORCE( workspace->HasBlob(name), "You requested a non-existing blob: ", name); if (caffe2::FLAGS_text_output) { auto blob = workspace->GetBlob(name)->GetMutable(); writeTextOutput(blob, output_prefix, name); } else { string serialized = workspace->GetBlob(name)->Serialize(name); string output_filename = output_prefix + name; caffe2::WriteStringToFile(serialized, output_filename.c_str()); } } } return 0; } ================================================ FILE: binaries/convert_caffe_image_db.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe/proto/caffe.pb.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(input_db, "", "The input db."); CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); CAFFE2_DEFINE_string(output_db, "", "The output db."); CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); using caffe2::db::Cursor; using caffe2::db::DB; using caffe2::db::Transaction; using caffe2::TensorProto; using caffe2::TensorProtos; int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); std::unique_ptr in_db(caffe2::db::CreateDB( caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); std::unique_ptr out_db(caffe2::db::CreateDB( caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); std::unique_ptr cursor(in_db->NewCursor()); std::unique_ptr transaction(out_db->NewTransaction()); int count = 0; for (; cursor->Valid(); cursor->Next()) { caffe::Datum datum; CAFFE_ENFORCE(datum.ParseFromString(cursor->value())); TensorProtos protos; TensorProto* data = protos.add_protos(); TensorProto* label = protos.add_protos(); label->set_data_type(TensorProto::INT32); label->add_dims(1); label->add_int32_data(datum.label()); if (datum.encoded()) { // This is an encoded image. we will copy over the data directly. data->set_data_type(TensorProto::STRING); data->add_dims(1); data->add_string_data(datum.data()); } else { // float data not supported right now. CAFFE_ENFORCE_EQ(datum.float_data_size(), 0); std::vector buffer_vec(datum.data().size()); char* buffer = buffer_vec.data(); // swap order from CHW to HWC int channels = datum.channels(); int size = datum.height() * datum.width(); CAFFE_ENFORCE_EQ(datum.data().size(), channels * size); for (int c = 0; c < channels; ++c) { char* dst = buffer + c; const char* src = datum.data().c_str() + c * size; for (int n = 0; n < size; ++n) { dst[n*channels] = src[n]; } } data->set_data_type(TensorProto::BYTE); data->add_dims(datum.height()); data->add_dims(datum.width()); data->add_dims(datum.channels()); data->set_byte_data(buffer, datum.data().size()); } transaction->Put(cursor->key(), protos.SerializeAsString()); if (++count % caffe2::FLAGS_batch_size == 0) { transaction->Commit(); LOG(INFO) << "Converted " << count << " items so far."; } } LOG(INFO) << "A total of " << count << " items processed."; return 0; } ================================================ FILE: binaries/convert_db.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(input_db, "", "The input db."); CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); CAFFE2_DEFINE_string(output_db, "", "The output db."); CAFFE2_DEFINE_string(output_db_type, "", "The output db type."); CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); using caffe2::db::Cursor; using caffe2::db::DB; using caffe2::db::Transaction; int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); std::unique_ptr in_db(caffe2::db::CreateDB( caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); std::unique_ptr out_db(caffe2::db::CreateDB( caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW)); std::unique_ptr cursor(in_db->NewCursor()); std::unique_ptr transaction(out_db->NewTransaction()); int count = 0; for (; cursor->Valid(); cursor->Next()) { transaction->Put(cursor->key(), cursor->value()); if (++count % caffe2::FLAGS_batch_size == 0) { transaction->Commit(); LOG(INFO) << "Converted " << count << " items so far."; } } LOG(INFO) << "A total of " << count << " items processed."; return 0; } ================================================ FILE: binaries/convert_encoded_to_raw_leveldb.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // This script converts an image dataset to leveldb. // // caffe2::FLAGS_input_folder is the root folder that holds all the images, and // caffe2::FLAGS_list_file should be a list of files as well as their labels, in the // format as // subfolder1/file1.JPEG 7 // .... 
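// Added worked example of the shorter-edge scaling implemented below
// (illustrative numbers, not from the source): with --scale=256 and a
// 640x480 input (cols=640, rows=480), rows <= cols, so scaled_height = 256
// and scaled_width = 640 * 256 / 480 = 341; with --warp the output is forced
// to 256x256 regardless of aspect ratio.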
#include #include // NOLINT(readability/streams) #include #include #include #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" #include "leveldb/db.h" #include "leveldb/write_batch.h" CAFFE2_DEFINE_string(input_db_name, "", "The input image file name."); CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); CAFFE2_DEFINE_int(scale, 256, "If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given " "value."); CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); namespace caffe2 { using std::string; using std::unique_ptr; void ConvertToRawDataset( const string& input_db_name, const string& output_db_name) { // input leveldb std::unique_ptr input_db; LOG(INFO) << "Opening input leveldb " << input_db_name; { leveldb::Options options; options.create_if_missing = false; leveldb::DB* db_temp; leveldb::Status status = leveldb::DB::Open( options, input_db_name, &db_temp); CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, "."); input_db.reset(db_temp); } // output leveldb std::unique_ptr output_db; std::unique_ptr batch; LOG(INFO) << "Opening leveldb " << output_db_name; { leveldb::Options options; options.error_if_exists = true; options.create_if_missing = true; options.write_buffer_size = 268435456; leveldb::DB* db_temp; leveldb::Status status = leveldb::DB::Open( options, output_db_name, &db_temp); CAFFE_ENFORCE( status.ok(), "Failed to open leveldb ", output_db_name, ". Is it already existing?"); output_db.reset(db_temp); } batch.reset(new leveldb::WriteBatch()); TensorProtos input_protos; TensorProtos output_protos; TensorProto* data = output_protos.add_protos(); TensorProto* label = output_protos.add_protos(); data->set_data_type(TensorProto::BYTE); data->add_dims(0); data->add_dims(0); if (caffe2::FLAGS_color) { data->add_dims(3); } string value; unique_ptr iter; iter.reset(input_db->NewIterator(leveldb::ReadOptions())); iter->SeekToFirst(); int count = 0; for (; iter->Valid(); iter->Next()) { CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString())); label->CopyFrom(input_protos.protos(1)); const string& encoded_image = input_protos.protos(0).string_data(0); int encoded_size = encoded_image.size(); cv::Mat img = cv::imdecode( cv::Mat(1, &encoded_size, CV_8UC1, const_cast(encoded_image.data())), caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv::Mat resized_img; int scaled_width, scaled_height; if (caffe2::FLAGS_warp) { scaled_width = caffe2::FLAGS_scale; scaled_height = caffe2::FLAGS_scale; } else if (img.rows > img.cols) { scaled_width = caffe2::FLAGS_scale; scaled_height = static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; } else { scaled_height = caffe2::FLAGS_scale; scaled_width = static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; } cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0, cv::INTER_LINEAR); data->set_dims(0, scaled_height); data->set_dims(1, scaled_width); DCHECK(resized_img.isContinuous()); data->set_byte_data(resized_img.ptr(), scaled_height * scaled_width * (caffe2::FLAGS_color ? 
3 : 1)); output_protos.SerializeToString(&value); // Put in db batch->Put(iter->key(), value); if (++count % 1000 == 0) { output_db->Write(leveldb::WriteOptions(), batch.get()); batch.reset(new leveldb::WriteBatch()); LOG(INFO) << "Processed " << count << " files."; } } // write the last batch if (count % 1000 != 0) { output_db->Write(leveldb::WriteOptions(), batch.get()); } LOG(INFO) << "Processed a total of " << count << " files."; } } // namespace caffe2 int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); caffe2::ConvertToRawDataset( caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name); return 0; } ================================================ FILE: binaries/core_overhead_benchmark.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "benchmark/benchmark.h" #include "caffe2/core/context.h" #include "caffe2/core/context_gpu.h" #include "caffe2/core/operator.h" #define CAFFE2_SKIP_IF_NO_GPU \ if (!caffe2::NumCudaDevices()) { \ state.SkipWithError("No CUDA available, skipping benchmark."); \ return; \ } using namespace caffe2; static void BM_CUDAContextCreation(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; volatile CUDAContext context_so_we_do_initialization_work; while (state.KeepRunning()) { volatile CUDAContext context; } } BENCHMARK(BM_CUDAContextCreation); static void BM_CUDAContextStreamAccess(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; CUDAContext context; while (state.KeepRunning()) { volatile cudaStream_t stream = context.cuda_stream(); } } BENCHMARK(BM_CUDAContextStreamAccess); static void BM_cudaGetDevice(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; int id; while (state.KeepRunning()) { CUDA_ENFORCE(cudaGetDevice(&id)); } } BENCHMARK(BM_cudaGetDevice); static void BM_cudaSetDevice(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; int total = NumCudaDevices(); int i = 0; while (state.KeepRunning()) { CUDA_ENFORCE(cudaSetDevice((i++) % total)); } } BENCHMARK(BM_cudaSetDevice); static void BM_cudaSetAndGetDevice(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; int total = NumCudaDevices(); int i = 0; int id; while (state.KeepRunning()) { CUDA_ENFORCE(cudaSetDevice((i++) % total)); CUDA_ENFORCE(cudaGetDevice(&id)); } } BENCHMARK(BM_cudaSetAndGetDevice); static void BM_cudaSetSameDevice(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; while (state.KeepRunning()) { CUDA_ENFORCE(cudaSetDevice(0)); } } BENCHMARK(BM_cudaSetSameDevice); static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; cudaStream_t stream; while (state.KeepRunning()) { CUDA_ENFORCE(cudaStreamCreate(&stream)); CUDA_ENFORCE(cudaStreamSynchronize(stream)); CUDA_ENFORCE(cudaStreamDestroy(stream)); } } BENCHMARK(BM_cudaStreamCreateSyncDelete); static void BM_cudaStreamSynchronize(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; cudaStream_t stream; CUDA_ENFORCE(cudaStreamCreate(&stream)); while (state.KeepRunning()) { 
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamSynchronize);

static void BM_cudaEventRecord(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaEventRecord(event, stream));
  }
}
BENCHMARK(BM_cudaEventRecord);

static void BM_cudaStreamWaitEventThenStreamSynchronize(
    benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  CUDA_ENFORCE(cudaEventRecord(event, stream));
  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
  CUDA_ENFORCE(cudaStreamSynchronize(stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);

static void BM_CudaPointerAffinity(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
  float* ptr = tensor.mutable_data<float>();
  while (state.KeepRunning()) {
    volatile int id = GetGPUIDForPointer(ptr);
  }
}
BENCHMARK(BM_CudaPointerAffinity);

namespace {
template <class Context>
class DummyEmptyOp : public Operator<Context> {
 public:
  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() final { return true; }
};
REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
} // namespace

static void BM_OperatorCreationCPU(benchmark::State& state) {
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CPU);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCPU);

static void BM_OperatorCreationCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CUDA);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCUDA);

static void BM_RawAllocDeallocCPU(benchmark::State& state) {
  while (state.KeepRunning()) {
    // Allocating only 1 byte in order to measure the overhead.
    auto ptr_and_deleter = GetCPUAllocator()->New(1);
    // Deallocate.
    ptr_and_deleter.second(ptr_and_deleter.first);
  }
}
BENCHMARK(BM_RawAllocDeallocCPU);

static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
  Tensor<CPUContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCPU);

static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  Tensor<CUDAContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCUDA);

BENCHMARK_MAIN()

================================================
FILE: binaries/db_throughput.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/core/timer.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(input_db, "", "The input db."); CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); CAFFE2_DEFINE_int(report_interval, 1000, "The report interval."); CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test."); CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface."); CAFFE2_DEFINE_int(num_read_threads, 1, "The number of concurrent reading threads."); using caffe2::db::Cursor; using caffe2::db::DB; using caffe2::db::DBReader; using caffe2::string; void TestThroughputWithDB() { std::unique_ptr in_db(caffe2::db::CreateDB( caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ)); std::unique_ptr cursor(in_db->NewCursor()); for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { caffe2::Timer timer; for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { string key = cursor->key(); string value = cursor->value(); //VLOG(1) << "Key " << key; cursor->Next(); if (!cursor->Valid()) { cursor->SeekToFirst(); } } double elapsed_seconds = timer.Seconds(); printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n", iter_id, elapsed_seconds, caffe2::FLAGS_report_interval / elapsed_seconds); } } void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) { string key, value; for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) { caffe2::Timer timer; for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) { reader->Read(&key, &value); } double elapsed_seconds = timer.Seconds(); printf("Thread %03d iteration %03d, took %4.5f seconds, " "throughput %f items/sec.\n", thread_id, iter_id, elapsed_seconds, caffe2::FLAGS_report_interval / elapsed_seconds); } } void TestThroughputWithReader() { caffe2::db::DBReader reader( caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db); std::vector> reading_threads( caffe2::FLAGS_num_read_threads); for (int i = 0; i < reading_threads.size(); ++i) { reading_threads[i].reset(new std::thread( TestThroughputWithReaderWorker, &reader, i)); } for (int i = 0; i < reading_threads.size(); ++i) { reading_threads[i]->join(); } } int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); if (caffe2::FLAGS_use_reader) { TestThroughputWithReader(); } else { TestThroughputWithDB(); } return 0; } ================================================ FILE: binaries/inspect_gpus.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <iostream>
#include <sstream>
#include <vector>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"

using std::vector;

CAFFE2_DECLARE_int(caffe2_log_level);

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::SetUsageMessage(
      "Inspects the GPUs on the current machine and prints out their details "
      "provided by cuda.");

  int gpu_count;
  CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count));
  for (int i = 0; i < gpu_count; ++i) {
    LOG(INFO) << "Querying device ID = " << i;
    caffe2::DeviceQuery(i);
  }

  vector<vector<bool>> access_pattern;
  CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));

  std::stringstream sstream;
  // Find topology
  for (int i = 0; i < gpu_count; ++i) {
    for (int j = 0; j < gpu_count; ++j) {
      sstream << (access_pattern[i][j] ? "+" : "-") << " ";
    }
    sstream << std::endl;
  }
  LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
  return 0;
}

================================================
FILE: binaries/make_cifar_db.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
//
// This script converts the CIFAR dataset to the leveldb format used
// by caffe to perform classification.
// Usage:
//    convert_cifar_data input_folder output_db_file
// The CIFAR dataset could be downloaded at
//    http://www.cs.toronto.edu/~kriz/cifar.html

#include <array>
#include <fstream>  // NOLINT(readability/streams)
#include <sstream>
#include <string>

#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"

CAFFE2_DEFINE_string(input_folder, "", "The input folder name.");
CAFFE2_DEFINE_string(output_train_db_name, "", "The output training db name.");
CAFFE2_DEFINE_string(output_test_db_name, "", "The output testing db name.");
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
CAFFE2_DEFINE_bool(is_cifar100, false,
                   "If set, convert cifar100. Otherwise do cifar10.");

namespace caffe2 {

using std::stringstream;

const int kCIFARSize = 32;
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
const int kCIFAR10BatchSize = 10000;
const int kCIFAR10TestDataSize = 10000;
const int kCIFAR10TrainBatches = 5;

const int kCIFAR100TrainDataSize = 50000;
const int kCIFAR100TestDataSize = 10000;

void ReadImage(std::ifstream* file, int* label, char* buffer) {
  char label_char;
  if (caffe2::FLAGS_is_cifar100) {
    // Skip the coarse label.
    file->read(&label_char, 1);
  }
  file->read(&label_char, 1);
  *label = label_char;
  // Yes, there are better ways to do it, like in-place swap... but I am too
  // lazy so let's just write it in a memory-wasteful way.
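  // Added explanation (not in the original source): the CIFAR binary format
  // stores each image channel-first (CHW) -- 1024 red bytes, then 1024 green,
  // then 1024 blue -- while the DB expects HWC. The copy below therefore maps
  //   buffer[i * 3 + c] = channel_first_storage[c * 32 * 32 + i]
  // so that the three channel bytes of pixel i end up adjacent.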
  std::array<char, kCIFARImageNBytes> channel_first_storage;
  file->read(channel_first_storage.data(), kCIFARImageNBytes);
  for (int c = 0; c < 3; ++c) {
    for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
      buffer[i * 3 + c] =
          channel_first_storage[c * kCIFARSize * kCIFARSize + i];
    }
  }
  return;
}

void WriteToDB(const string& filename, const int num_items,
               const int& offset, db::DB* db) {
  TensorProtos protos;
  TensorProto* data = protos.add_protos();
  TensorProto* label = protos.add_protos();
  data->set_data_type(TensorProto::BYTE);
  data->add_dims(kCIFARSize);
  data->add_dims(kCIFARSize);
  data->add_dims(3);
  label->set_data_type(TensorProto::INT32);
  label->add_dims(1);
  label->add_int32_data(0);

  LOG(INFO) << "Converting file " << filename;
  std::ifstream data_file(filename.c_str(), std::ios::in | std::ios::binary);
  CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
  char str_buffer[kCIFARImageNBytes];
  int label_value;
  string serialized_protos;
  std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
  for (int itemid = 0; itemid < num_items; ++itemid) {
    ReadImage(&data_file, &label_value, str_buffer);
    data->set_byte_data(str_buffer, kCIFARImageNBytes);
    label->set_int32_data(0, label_value);
    protos.SerializeToString(&serialized_protos);
    snprintf(str_buffer, kCIFARImageNBytes, "%05d", offset + itemid);
    transaction->Put(string(str_buffer), serialized_protos);
  }
}

void ConvertCIFAR() {
  std::unique_ptr<db::DB> train_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name,
                   db::NEW));
  std::unique_ptr<db::DB> test_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name,
                   db::NEW));
  if (!caffe2::FLAGS_is_cifar100) {
    // This is cifar 10.
    for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
      stringstream train_file;
      train_file << caffe2::FLAGS_input_folder << "/data_batch_"
                 << fileid + 1 << ".bin";
      WriteToDB(train_file.str(), kCIFAR10BatchSize,
                fileid * kCIFAR10BatchSize, train_db.get());
    }
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test_batch.bin";
    WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
  } else {
    // This is cifar 100.
    stringstream train_file;
    train_file << caffe2::FLAGS_input_folder << "/train.bin";
    WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test.bin";
    WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
  }
}
} // namespace caffe2

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertCIFAR();
  return 0;
}

================================================
FILE: binaries/make_image_db.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// This script converts an image dataset to a database.
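// Illustrative invocation (added sketch; paths and db type are placeholders):
//
//   make_image_db --db=lmdb --input_folder=/data/images/ \
//     --list_file=/data/train_list.txt \
//     --output_db_name=/data/train.lmdb --num_threads=8 --shuffle
//
// With --raw the images are decoded, resized and stored as raw bytes;
// otherwise the encoded file contents are stored as-is.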
// // caffe2::FLAGS_input_folder is the root folder that holds all the images // // caffe2::FLAGS_list_file is the path to a file containing a list of files // and their labels, as follows: // // subfolder1/file1.JPEG 7 // subfolder1/file2.JPEG 7 // subfolder2/file1.JPEG 8 // ... // #include #include #include #include #include #include #include #include "caffe2/core/common.h" #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_bool(shuffle, false, "Randomly shuffle the order of images and their labels"); CAFFE2_DEFINE_string(input_folder, "", "The input image file name."); CAFFE2_DEFINE_string( list_file, "", "The text file containing the list of images."); CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name."); CAFFE2_DEFINE_string(db, "leveldb", "The db type."); CAFFE2_DEFINE_bool(raw, false, "If set, we pre-read the images and store the raw buffer."); CAFFE2_DEFINE_bool(color, true, "If set, load images in color."); CAFFE2_DEFINE_int( scale, 256, "If caffe2::FLAGS_raw is set, scale the shorter edge to the given value."); CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square."); CAFFE2_DEFINE_int( num_threads, -1, "Number of image parsing and conversion threads."); namespace caffe2 { class Converter { public: explicit Converter() { data_ = protos_.add_protos(); label_ = protos_.add_protos(); if (caffe2::FLAGS_raw) { data_->set_data_type(TensorProto::BYTE); data_->add_dims(0); data_->add_dims(0); if (caffe2::FLAGS_color) { data_->add_dims(3); } } else { data_->set_data_type(TensorProto::STRING); data_->add_dims(1); data_->add_string_data(""); } label_->set_data_type(TensorProto::INT32); label_->add_dims(1); label_->add_int32_data(0); } ~Converter() { if (thread_.joinable()) { thread_.join(); } } void queue(const std::pair& pair) { in_.push(pair); } void start() { thread_ = std::thread(&Converter::run, this); } std::string get() { std::unique_lock lock(mutex_); while (out_.empty()) { cv_.wait(lock); } auto value = out_.front(); out_.pop(); cv_.notify_one(); return value; } void run() { const auto& input_folder = caffe2::FLAGS_input_folder; std::unique_lock lock(mutex_); std::string value; while (!in_.empty()) { auto pair = in_.front(); in_.pop(); lock.unlock(); label_->set_int32_data(0, pair.second); // Add raw file contents to DB if !raw if (!caffe2::FLAGS_raw) { std::ifstream image_file_stream(input_folder + pair.first); if (!image_file_stream) { LOG(ERROR) << "Cannot open " << input_folder << pair.first << ". Skipping."; } else { data_->mutable_string_data(0)->assign( std::istreambuf_iterator(image_file_stream), std::istreambuf_iterator()); } } else { // Load image cv::Mat img = cv::imread( input_folder + pair.first, caffe2::FLAGS_color ? 
CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); // Resize image cv::Mat resized_img; int scaled_width, scaled_height; if (caffe2::FLAGS_warp) { scaled_width = caffe2::FLAGS_scale; scaled_height = caffe2::FLAGS_scale; } else if (img.rows > img.cols) { scaled_width = caffe2::FLAGS_scale; scaled_height = static_cast(img.rows) * caffe2::FLAGS_scale / img.cols; } else { scaled_height = caffe2::FLAGS_scale; scaled_width = static_cast(img.cols) * caffe2::FLAGS_scale / img.rows; } cv::resize( img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0, cv::INTER_LINEAR); data_->set_dims(0, scaled_height); data_->set_dims(1, scaled_width); // Assert we don't have to deal with alignment DCHECK(resized_img.isContinuous()); auto nbytes = resized_img.total() * resized_img.elemSize(); data_->set_byte_data(resized_img.ptr(), nbytes); } protos_.SerializeToString(&value); // Add serialized proto to out queue or wait if it is not empty lock.lock(); while (!out_.empty()) { cv_.wait(lock); } out_.push(value); cv_.notify_one(); } } protected: TensorProtos protos_; TensorProto* data_; TensorProto* label_; std::queue> in_; std::queue out_; std::mutex mutex_; std::condition_variable cv_; std::thread thread_; }; void ConvertImageDataset( const string& input_folder, const string& list_filename, const string& output_db_name, const bool /*shuffle*/) { std::ifstream list_file(list_filename); std::vector > lines; std::string filename; int file_label; while (list_file >> filename >> file_label) { lines.push_back(std::make_pair(filename, file_label)); } if (caffe2::FLAGS_shuffle) { LOG(INFO) << "Shuffling data"; std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701)); } auto num_threads = caffe2::FLAGS_num_threads; if (num_threads < 1) { num_threads = std::thread::hardware_concurrency(); } LOG(INFO) << "Processing " << lines.size() << " images..."; LOG(INFO) << "Opening DB " << output_db_name; auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW); auto transaction = db->NewTransaction(); LOG(INFO) << "Using " << num_threads << " processing threads..."; std::vector converters(num_threads); // Queue entries across converters for (auto i = 0; i < lines.size(); i++) { converters[i % converters.size()].queue(lines[i]); } // Start all converters for (auto& converter : converters) { converter.start(); } constexpr auto key_max_length = 256; char key_cstr[key_max_length]; string value; int count = 0; for (auto i = 0; i < lines.size(); i++) { // Get serialized proto for this entry auto value = converters[i % converters.size()].get(); // Synthesize key for this entry auto key_len = snprintf( key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str()); DCHECK_LE(key_len, sizeof(key_cstr)); // Put in db transaction->Put(string(key_cstr), value); if (++count % 1000 == 0) { // Commit the current writes. transaction->Commit(); LOG(INFO) << "Processed " << count << " files."; } } // Commit final transaction transaction->Commit(); LOG(INFO) << "Processed " << count << " files."; } } // namespace caffe2 int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); caffe2::ConvertImageDataset( caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file, caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle); return 0; } ================================================ FILE: binaries/make_mnist_db.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // This script converts the MNIST dataset to leveldb. // The MNIST dataset could be downloaded at // http://yann.lecun.com/exdb/mnist/ #include // NOLINT(readability/streams) #include #include "caffe2/core/common.h" #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(image_file, "", "The input image file name."); CAFFE2_DEFINE_string(label_file, "", "The label file name."); CAFFE2_DEFINE_string(output_file, "", "The output db name."); CAFFE2_DEFINE_string(db, "leveldb", "The db type."); CAFFE2_DEFINE_int(data_limit, -1, "If set, only output this number of data points."); CAFFE2_DEFINE_bool(channel_first, false, "If set, write the data as channel-first (CHW order) as the old " "Caffe does."); namespace caffe2 { uint32_t swap_endian(uint32_t val) { val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); return (val << 16) | (val >> 16); } void convert_dataset(const char* image_filename, const char* label_filename, const char* db_path, const int data_limit) { // Open files std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename); CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename); // Read the magic and the meta data uint32_t magic; uint32_t num_items; uint32_t num_labels; uint32_t rows; uint32_t cols; image_file.read(reinterpret_cast(&magic), 4); magic = swap_endian(magic); if (magic == 529205256) { LOG(FATAL) << "It seems that you forgot to unzip the mnist dataset. You should " "first unzip them using e.g. 
gunzip on Linux."; } CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic."); label_file.read(reinterpret_cast(&magic), 4); magic = swap_endian(magic); CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic."); image_file.read(reinterpret_cast(&num_items), 4); num_items = swap_endian(num_items); label_file.read(reinterpret_cast(&num_labels), 4); num_labels = swap_endian(num_labels); CAFFE_ENFORCE_EQ(num_items, num_labels); image_file.read(reinterpret_cast(&rows), 4); rows = swap_endian(rows); image_file.read(reinterpret_cast(&cols), 4); cols = swap_endian(cols); // leveldb std::unique_ptr mnist_db(db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW)); std::unique_ptr transaction(mnist_db->NewTransaction()); // Storing to db char label_value; std::vector pixels(rows * cols); int count = 0; const int kMaxKeyLength = 10; char key_cstr[kMaxKeyLength]; string value; TensorProtos protos; TensorProto* data = protos.add_protos(); TensorProto* label = protos.add_protos(); data->set_data_type(TensorProto::BYTE); if (caffe2::FLAGS_channel_first) { data->add_dims(1); data->add_dims(rows); data->add_dims(cols); } else { data->add_dims(rows); data->add_dims(cols); data->add_dims(1); } label->set_data_type(TensorProto::INT32); label->add_int32_data(0); LOG(INFO) << "A total of " << num_items << " items."; LOG(INFO) << "Rows: " << rows << " Cols: " << cols; for (int item_id = 0; item_id < num_items; ++item_id) { image_file.read(pixels.data(), rows * cols); label_file.read(&label_value, 1); for (int i = 0; i < rows * cols; ++i) { data->set_byte_data(pixels.data(), rows * cols); } label->set_int32_data(0, static_cast(label_value)); snprintf(key_cstr, kMaxKeyLength, "%08d", item_id); protos.SerializeToString(&value); string keystr(key_cstr); // Put in db transaction->Put(keystr, value); if (++count % 1000 == 0) { transaction->Commit(); } if (data_limit > 0 && count == data_limit) { LOG(INFO) << "Reached data limit of " << data_limit << ", stop."; break; } } } } // namespace caffe2 int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(), caffe2::FLAGS_label_file.c_str(), caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit); return 0; } ================================================ FILE: binaries/predictor_verifier.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "caffe2/core/flags.h" #include "caffe2/core/init.h" #include "caffe2/core/predictor.h" #include "caffe2/utils/proto_utils.h" CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer."); CAFFE2_DEFINE_string( predict_net, "", "The given path to the predict protobuffer."); namespace caffe2 { void run() { if (FLAGS_init_net.empty()) { LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net."; } if (FLAGS_predict_net.empty()) { LOG(FATAL) << "No predict net specified. 
Use --predict_net=/path/to/net.";
  }
  caffe2::NetDef init_net, predict_net;
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
  // Can be large due to constant fills
  VLOG(1) << "Init net: " << ProtoDebugString(init_net);
  LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
  auto predictor = caffe2::make_unique<Predictor>(init_net, predict_net);
  LOG(INFO) << "Checking that a null forward-pass works";
  Predictor::TensorVector inputVec, outputVec;
  predictor->run(inputVec, &outputVec);
  CAFFE_ENFORCE_GT(outputVec.size(), 0);
}
} // namespace caffe2

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::run();
  // This is to allow us to use memory leak checks.
  google::protobuf::ShutdownProtobufLibrary();
  return 0;
}

================================================
FILE: binaries/print_core_object_sizes.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <iostream>

#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"

#define PRINT_SIZE(cls)                                         \
  std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
            << std::endl;

int main(int /* unused */, char** /* unused */) {
  PRINT_SIZE(caffe2::Blob);
  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::CPUContext);
  PRINT_SIZE(caffe2::CUDAContext);
  PRINT_SIZE(caffe2::OperatorBase);
  PRINT_SIZE(caffe2::OperatorDef);
  PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::TypeMeta);
  PRINT_SIZE(caffe2::Workspace);
  return 0;
}

================================================
FILE: binaries/print_registered_core_operators.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #include #include #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/core/operator_schema.h" CAFFE2_DEFINE_string(schema, "", "Print doc and schema of a particular operator"); static bool HasSchema(const std::string& str) { return caffe2::OpSchemaRegistry::Schema(str); } static bool HasDoc(const std::string& str) { const auto* schema = caffe2::OpSchemaRegistry::Schema(str); return (schema != nullptr) && (schema->doc() != nullptr); } int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); if (!caffe2::FLAGS_schema.empty()) { const auto* schema = caffe2::OpSchemaRegistry::Schema( caffe2::FLAGS_schema); if (!schema) { std::cerr << "Operator " << caffe2::FLAGS_schema << " doesn't have a schema" << std::endl; return 1; } std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl << *schema; return 0; } for (const auto& pair : *caffe2::gDeviceTypeRegistry()) { std::cout << "Device type " << pair.first #ifndef CAFFE2_USE_LITE_PROTO << " (" << caffe2::DeviceType_Name( static_cast(pair.first)) << ")" #endif << std::endl; for (const auto& key : pair.second->Keys()) { std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key) << ")\t" << key << std::endl; } } std::cout << "Operators that have gradients registered:" << std::endl; for (const auto& key : caffe2::GradientRegistry()->Keys()) { std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key) << ")\t" << key << std::endl; } return 0; } ================================================ FILE: binaries/run_plan.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/utils/proto_utils.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); if (caffe2::FLAGS_plan.size() == 0) { LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan."; return 0; } LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; caffe2::PlanDef plan_def; CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); std::unique_ptr workspace(new caffe2::Workspace()); workspace->RunPlan(plan_def); // This is to allow us to use memory leak checks. google::protobuf::ShutdownProtobufLibrary(); return 0; } ================================================ FILE: binaries/run_plan_mpi.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/utils/proto_utils.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer."); int main(int argc, char** argv) { caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it."); int mpi_ret; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret); if (mpi_ret != MPI_THREAD_MULTIPLE && mpi_ret != MPI_THREAD_SERIALIZED) { std::cerr << "Caffe2 MPI requires the underlying MPI to support the " "MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n"; return 1; } caffe2::GlobalInit(&argc, &argv); LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan; caffe2::PlanDef plan_def; CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def)); std::unique_ptr workspace(new caffe2::Workspace()); workspace->RunPlan(plan_def); // This is to allow us to use memory leak checks. google::protobuf::ShutdownProtobufLibrary(); MPI_Finalize(); return 0; } ================================================ FILE: binaries/speed_benchmark.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "caffe2/core/init.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/utils/proto_utils.h" #include "caffe2/utils/string_utils.h" CAFFE2_DEFINE_string(net, "", "The given net to benchmark."); CAFFE2_DEFINE_string( init_net, "", "The given net to initialize any parameters."); CAFFE2_DEFINE_string( input, "", "Input that is needed for running the network. If " "multiple input needed, use comma separated string."); CAFFE2_DEFINE_string( input_file, "", "Input file that contain the serialized protobuf for " "the input blobs. If multiple input needed, use comma " "separated string. Must have the same number of items " "as input does."); CAFFE2_DEFINE_string( input_dims, "", "Alternate to input_files, if all inputs are simple " "float TensorCPUs, specify the dimension using comma " "separated numbers. If multiple input needed, use " "semicolon to separate the dimension of different " "tensors."); CAFFE2_DEFINE_string( output, "", "Output that should be dumped after the execution " "finishes. If multiple outputs are needed, use comma " "separated string. If you want to dump everything, pass " "'*' as the output value."); CAFFE2_DEFINE_string( output_folder, "", "The folder that the output should be written to. 
This " "folder must already exist in the file system."); CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run."); CAFFE2_DEFINE_bool( run_individual, false, "Whether to benchmark individual operators."); CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators"); CAFFE2_DEFINE_string(engine, "", "Forced engine field value"); CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators"); CAFFE2_DEFINE_string(algo, "", "Forced algo arg value"); using std::string; using std::unique_ptr; using std::vector; int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); unique_ptr workspace(new caffe2::Workspace()); // Run initialization network. caffe2::NetDef net_def; CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def)); CAFFE_ENFORCE(workspace->RunNetOnce(net_def)); // Load input. if (caffe2::FLAGS_input.size()) { vector input_names = caffe2::split(',', caffe2::FLAGS_input); if (caffe2::FLAGS_input_file.size()) { vector input_files = caffe2::split(',', caffe2::FLAGS_input_file); CAFFE_ENFORCE_EQ( input_names.size(), input_files.size(), "Input name and file should have the same number."); for (int i = 0; i < input_names.size(); ++i) { caffe2::BlobProto blob_proto; CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto)); workspace->CreateBlob(input_names[i])->Deserialize(blob_proto); } } else if (caffe2::FLAGS_input_dims.size()) { vector input_dims_list = caffe2::split(';', caffe2::FLAGS_input_dims); CAFFE_ENFORCE_EQ( input_names.size(), input_dims_list.size(), "Input name and dims should have the same number of items."); for (int i = 0; i < input_names.size(); ++i) { vector input_dims_str = caffe2::split(',', input_dims_list[i]); vector input_dims; for (const string& s : input_dims_str) { input_dims.push_back(caffe2::stoi(s)); } caffe2::TensorCPU* tensor = workspace->GetBlob(input_names[i])->GetMutable(); tensor->Resize(input_dims); tensor->mutable_data(); } } else { CAFFE_THROW( "You requested input tensors, but neither input_file nor " "input_dims is set."); } } // Run main network. CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); // force changing engine and algo if (caffe2::FLAGS_force_engine) { LOG(INFO) << "force engine be: " << caffe2::FLAGS_engine; for (const auto& op : net_def.op()) { const_cast(&op)->set_engine(caffe2::FLAGS_engine); } } if (caffe2::FLAGS_force_algo) { LOG(INFO) << "force algo be: " << caffe2::FLAGS_algo; for (const auto& op : net_def.op()) { caffe2::GetMutableArgument( "algo", true, const_cast(&op)) ->set_s(caffe2::FLAGS_algo); } } caffe2::NetBase* net = workspace->CreateNet(net_def); CHECK_NOTNULL(net); net->TEST_Benchmark( caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual); string output_prefix = caffe2::FLAGS_output_folder.size() ? 
caffe2::FLAGS_output_folder + "/" : ""; if (caffe2::FLAGS_output.size()) { vector output_names = caffe2::split(',', caffe2::FLAGS_output); if (caffe2::FLAGS_output == "*") { output_names = workspace->Blobs(); } for (const string& name : output_names) { CAFFE_ENFORCE( workspace->HasBlob(name), "You requested a non-existing blob: ", name); string serialized = workspace->GetBlob(name)->Serialize(name); string output_filename = output_prefix + name; caffe2::WriteStringToFile(serialized, output_filename.c_str()); } } return 0; } ================================================ FILE: binaries/split_db.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/core/logging.h" CAFFE2_DEFINE_string(input_db, "", "The input db."); CAFFE2_DEFINE_int(splits, 0, "The number of splits."); CAFFE2_DEFINE_string(db_type, "", "The db type."); CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size."); namespace caffe2 { static int Split(int argc, char** argv) { GlobalInit(&argc, &argv); CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db."); CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number."); CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type]."); unique_ptr in_db( db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ)); CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db); unique_ptr cursor(in_db->NewCursor()); // This usually won't happen, but FWIW. CAFFE_ENFORCE( cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db); vector> out_dbs; vector> transactions; for (int i = 0; i < FLAGS_splits; ++i) { out_dbs.push_back(unique_ptr(db::CreateDB( FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW))); CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i); transactions.push_back( unique_ptr(out_dbs[i]->NewTransaction())); CAFFE_ENFORCE( transactions.back().get(), "Cannot get transaction for output db #", i); } int count = 0; for (; cursor->Valid(); cursor->Next()) { transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value()); if (++count % FLAGS_batch_size == 0) { for (int i = 0; i < FLAGS_splits; ++i) { transactions[i]->Commit(); } LOG(INFO) << "Split " << count << " items so far."; } } LOG(INFO) << "A total of " << count << " items processed."; return 0; } } // namespace caffe2 int main(int argc, char** argv) { return caffe2::Split(argc, argv); } ================================================ FILE: binaries/tutorial_blob.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/core/blob.h"
#include "caffe2/core/init.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/logging.h"

// We will be lazy and just use the whole namespace.
using namespace caffe2;

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ShowLogInfoToStderr();

  LOG(INFO) << "This script corresponds to the Blob part of the Caffe2 C++ "
               "tutorial.";

  LOG(INFO) << "Let's create a blob myblob.";
  Blob myblob;

  LOG(INFO) << "Let's set it to int and set the value to 10.";
  int* myint = myblob.GetMutable<int>();
  *myint = 10;

  LOG(INFO) << "Is the blob type int? " << myblob.IsType<int>();
  LOG(INFO) << "Is the blob type float? " << myblob.IsType<float>();

  const int& myint_const = myblob.Get<int>();
  LOG(INFO) << "The value of the int number stored in the blob is: "
            << myint_const;

  LOG(INFO)
      << "Let's try to get a float pointer. This will trigger an exception.";
  try {
    const float& myfloat = myblob.Get<float>();
    LOG(FATAL) << "This line should never happen.";
  } catch (std::exception& e) {
    LOG(INFO) << "As expected, we got an exception. Its content says: "
              << e.what();
  }

  LOG(INFO) << "However, we can change the content type (and destroy the old "
               "content) by calling GetMutable. Let's change it to double.";
  double* mydouble = myblob.GetMutable<double>();
  *mydouble = 3.14;
  LOG(INFO) << "The new content is: " << myblob.Get<double>();

  LOG(INFO)
      << "If we have a pre-created object, we can use Reset() to transfer the "
         "object to a blob.";
  std::string* pvec = new std::string();
  myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.

  LOG(INFO) << "Is the blob now of type string? "
            << myblob.IsType<std::string>();

  LOG(INFO) << "This concludes the blob tutorial.";
  return 0;
}

================================================
FILE: binaries/zmq_feeder.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// This binary provides an easy way to open a zeromq server and feed data to
// clients connected to it. It uses the Caffe2 db as the backend, thus allowing
// one to convert any db-compliant storage to a zeromq service.
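// Illustrative usage (added sketch; the db path and type are placeholders):
//
//   zmq_feeder --input_db=/data/train_db --input_db_type=leveldb \
//              --server="tcp://*:5555"
//
// The feeder PUSHes each record as a two-part message (key with ZMQ_SNDMORE,
// then value), so a client connecting a ZMQ_PULL socket to tcp://<host>:5555
// receives key/value pairs in cursor order, forever, since the cursor rewinds
// once it reaches the end.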
#include "caffe2/core/db.h" #include "caffe2/core/init.h" #include "caffe2/core/logging.h" #include "caffe2/utils/zmq_helper.h" CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address."); CAFFE2_DEFINE_string(input_db, "", "The input db."); CAFFE2_DEFINE_string(input_db_type, "", "The input db type."); using caffe2::db::DB; using caffe2::db::Cursor; using caffe2::string; int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); LOG(INFO) << "Opening DB..."; auto in_db = caffe2::db::CreateDB( caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ); CAFFE_ENFORCE( in_db, "Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " + caffe2::FLAGS_input_db_type); auto cursor = in_db->NewCursor(); LOG(INFO) << "DB opened."; LOG(INFO) << "Starting ZeroMQ server..."; // Socket to talk to clients caffe2::ZmqSocket sender(ZMQ_PUSH); sender.Bind(caffe2::FLAGS_server); LOG(INFO) << "Server created at " << caffe2::FLAGS_server; while (1) { VLOG(1) << "Sending " << cursor->key(); sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE); sender.SendTillSuccess(cursor->value(), 0); cursor->Next(); if (!cursor->Valid()) { cursor->SeekToFirst(); } } // We do not do an elegant quit since this binary is going to be terminated by // control+C. return 0; } ================================================ FILE: caffe/__init__.py ================================================ ================================================ FILE: caffe/proto/CMakeLists.txt ================================================ file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto") caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES}) add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS}) if (MSVC) if(BUILD_SHARED_LIBS) set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)") else() set(Caffe2_API_DEFINE "-DCAFFE2_API=") endif() target_compile_definitions( Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE}) endif() install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto) ================================================ FILE: caffe/proto/__init__.py ================================================ ================================================ FILE: caffe/proto/caffe.proto ================================================ syntax = "proto2"; package caffe; // Specifies the shape (dimensions) of a Blob. message BlobShape { repeated int64 dim = 1 [packed = true]; } message BlobProto { optional BlobShape shape = 7; repeated float data = 5 [packed = true]; repeated float diff = 6 [packed = true]; repeated double double_data = 8 [packed = true]; repeated double double_diff = 9 [packed = true]; // 4D dimensions -- deprecated. Use "shape" instead. optional int32 num = 1 [default = 0]; optional int32 channels = 2 [default = 0]; optional int32 height = 3 [default = 0]; optional int32 width = 4 [default = 0]; } // The BlobProtoVector is simply a way to pass multiple blobproto instances // around. message BlobProtoVector { repeated BlobProto blobs = 1; } message Datum { optional int32 channels = 1; optional int32 height = 2; optional int32 width = 3; // the actual image data, in bytes optional bytes data = 4; optional int32 label = 5; // Optionally, the datum could also hold float data. repeated float float_data = 6; // If true data contains an encoded image that need to be decoded optional bool encoded = 7 [default = false]; } message FillerParameter { // The filler type. 
optional string type = 1 [default = 'constant']; optional float value = 2 [default = 0]; // the value in constant filler optional float min = 3 [default = 0]; // the min value in uniform filler optional float max = 4 [default = 1]; // the max value in uniform filler optional float mean = 5 [default = 0]; // the mean value in Gaussian filler optional float std = 6 [default = 1]; // the std value in Gaussian filler // The expected number of non-zero output weights for a given input in // Gaussian filler -- the default -1 means don't perform sparsification. optional int32 sparse = 7 [default = -1]; // Normalize the filler variance by fan_in, fan_out, or their average. // Applies to 'xavier' and 'msra' fillers. enum VarianceNorm { FAN_IN = 0; FAN_OUT = 1; AVERAGE = 2; } optional VarianceNorm variance_norm = 8 [default = FAN_IN]; } message NetParameter { optional string name = 1; // consider giving the network a name // DEPRECATED. See InputParameter. The input blobs to the network. repeated string input = 3; // DEPRECATED. See InputParameter. The shape of the input blobs. repeated BlobShape input_shape = 8; // 4D input dimensions -- deprecated. Use "input_shape" instead. // If specified, for each input blob there should be four // values specifying the num, channels, height and width of the input blob. // Thus, there should be a total of (4 * #input) numbers. repeated int32 input_dim = 4; // Whether the network will force every layer to carry out backward operation. // If set False, then whether to carry out backward is determined // automatically according to the net structure and learning rates. optional bool force_backward = 5 [default = false]; // The current "state" of the network, including the phase, level, and stage. // Some layers may be included/excluded depending on this state and the states // specified in the layers' include and exclude fields. optional NetState state = 6; // Print debugging information about results while running Net::Forward, // Net::Backward, and Net::Update. optional bool debug_info = 7 [default = false]; // The layers that make up the net. Each of their configurations, including // connectivity and behavior, is specified as a LayerParameter. repeated LayerParameter layer = 100; // ID 100 so layers are printed last. // DEPRECATED: use 'layer' instead. repeated V1LayerParameter layers = 2; } // NOTE // Update the next available ID when you add a new SolverParameter field. // // SolverParameter next available ID: 41 (last added: type) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks // // Exactly one train net must be specified using one of the following fields: // train_net_param, train_net, net_param, net // One or more test nets may be specified using any of the following fields: // test_net_param, test_net, net_param, net // If more than one test net field is specified (e.g., both net and // test_net are specified), they will be evaluated in the field order given // above: (1) test_net_param, (2) test_net, (3) net_param/net. // A test_iter must be specified for each test_net. // A test_level and/or a test_stage may also be specified for each test_net. ////////////////////////////////////////////////////////////////////////////// // Proto filename for the train net, possibly combined with one or more // test nets. optional string net = 24; // Inline train net param, possibly combined with one or more test nets. 
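// (Illustrative: instead of pointing net/train_net at a .prototxt file, the
// same network may be embedded inline via the net_param field below, e.g.
// net_param { name: "LeNet" layer { ... } } -- the "..." stands for ordinary
// layer entries.)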
  optional NetParameter net_param = 25;

  optional string train_net = 1; // Proto filename for the train net.
  repeated string test_net = 2; // Proto filenames for the test nets.
  optional NetParameter train_net_param = 21; // Inline train net params.
  repeated NetParameter test_net_param = 22; // Inline test net params.

  // The states for the train/test nets. Must be unspecified or
  // specified once per net.
  //
  // By default, all states will have solver = true;
  // train_state will have phase = TRAIN,
  // and all test_state's will have phase = TEST.
  // Other defaults are set according to the NetState defaults.
  optional NetState train_state = 26;
  repeated NetState test_state = 27;

  // The number of iterations for each test net.
  repeated int32 test_iter = 3;

  // The number of iterations between two testing phases.
  optional int32 test_interval = 4 [default = 0];
  optional bool test_compute_loss = 19 [default = false];
  // If true, run an initial test pass before the first iteration,
  // ensuring memory availability and printing the starting value of the loss.
  optional bool test_initialization = 32 [default = true];
  optional float base_lr = 5; // The base learning rate
  // the number of iterations between displaying info. If display = 0, no info
  // will be displayed.
  optional int32 display = 6;
  // Display the loss averaged over the last average_loss iterations
  optional int32 average_loss = 33 [default = 1];
  optional int32 max_iter = 7; // the maximum number of iterations
  // accumulate gradients over `iter_size` x `batch_size` instances
  optional int32 iter_size = 36 [default = 1];

  // The learning rate decay policy. The currently implemented learning rate
  // policies are as follows:
  //    - fixed: always return base_lr.
  //    - step: return base_lr * gamma ^ (floor(iter / step))
  //    - exp: return base_lr * gamma ^ iter
  //    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
  //    - multistep: similar to step but it allows non-uniform steps defined by
  //      stepvalue
  //    - poly: the effective learning rate follows a polynomial decay, to be
  //      zero by the max_iter. return base_lr * (1 - iter/max_iter) ^ (power)
  //    - sigmoid: the effective learning rate follows a sigmoid decay
  //      return base_lr * (1/(1 + exp(-gamma * (iter - stepsize))))
  //
  // where base_lr, max_iter, gamma, step, stepvalue and power are defined
  // in the solver parameter protocol buffer, and iter is the current iteration.
  optional string lr_policy = 8;
  optional float gamma = 9; // The parameter to compute the learning rate.
  optional float power = 10; // The parameter to compute the learning rate.
  optional float momentum = 11; // The momentum value.
  optional float weight_decay = 12; // The weight decay.
  // regularization types supported: L1 and L2
  // controlled by weight_decay
  optional string regularization_type = 29 [default = "L2"];
  // the stepsize for learning rate policy "step"
  optional int32 stepsize = 13;
  // the stepsize for learning rate policy "multistep"
  repeated int32 stepvalue = 34;

  // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
  // whenever their actual L2 norm is larger.
  optional float clip_gradients = 35 [default = -1];

  optional int32 snapshot = 14 [default = 0]; // The snapshot interval
  optional string snapshot_prefix = 15; // The prefix for the snapshot.
  // whether to snapshot diff in the results or not. Snapshotting diff will help
  // debugging but the final protocol buffer size will be much larger.
  optional bool snapshot_diff = 16 [default = false];
  enum SnapshotFormat {
    HDF5 = 0;
    BINARYPROTO = 1;
  }
  optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
  // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU by default.
  enum SolverMode {
    CPU = 0;
    GPU = 1;
  }
  optional SolverMode solver_mode = 17 [default = GPU];
  // the device_id that will be used in GPU mode. Use device_id = 0 by default.
  optional int32 device_id = 18 [default = 0];
  // If non-negative, the seed with which the Solver will initialize the Caffe
  // random number generator -- useful for reproducible results. Otherwise,
  // (and by default) initialize using a seed derived from the system clock.
  optional int64 random_seed = 20 [default = -1];

  // type of the solver
  optional string type = 40 [default = "SGD"];

  // numerical stability for RMSProp, AdaGrad, AdaDelta and Adam
  optional float delta = 31 [default = 1e-8];
  // parameters for the Adam solver
  optional float momentum2 = 39 [default = 0.999];

  // RMSProp decay value
  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
  optional float rms_decay = 38;

  // If true, print information about the state of the net that may help with
  // debugging learning problems.
  optional bool debug_info = 23 [default = false];

  // If false, don't save a snapshot after training finishes.
  optional bool snapshot_after_train = 28 [default = true];

  // DEPRECATED: old solver enum types, use string instead
  enum SolverType {
    SGD = 0;
    NESTEROV = 1;
    ADAGRAD = 2;
    RMSPROP = 3;
    ADADELTA = 4;
    ADAM = 5;
  }
  // DEPRECATED: use type instead of solver_type
  optional SolverType solver_type = 30 [default = SGD];
}

// A message that stores the solver snapshots
message SolverState {
  optional int32 iter = 1; // The current iteration
  optional string learned_net = 2; // The file that stores the learned net.
  repeated BlobProto history = 3; // The history for sgd solvers
  optional int32 current_step = 4 [default = 0]; // The current step for learning rate
}

enum Phase {
  TRAIN = 0;
  TEST = 1;
}

message NetState {
  optional Phase phase = 1 [default = TEST];
  optional int32 level = 2 [default = 0];
  repeated string stage = 3;
}

message NetStateRule {
  // Set phase to require the NetState have a particular phase (TRAIN or TEST)
  // to meet this rule.
  optional Phase phase = 1;

  // Set the minimum and/or maximum levels in which the layer should be used.
  // Leave undefined to meet the rule regardless of level.
  optional int32 min_level = 2;
  optional int32 max_level = 3;

  // Customizable sets of stages to include or exclude.
  // The net must have ALL of the specified stages and NONE of the specified
  // "not_stage"s to meet the rule.
  // (Use multiple NetStateRules to specify conjunctions of stages.)
  repeated string stage = 4;
  repeated string not_stage = 5;
}

// Specifies training parameters (multipliers on global learning constants,
// and the name and other settings used for weight sharing).
message ParamSpec {
  // The names of the parameter blobs -- useful for sharing parameters among
  // layers, but never required otherwise. To share a parameter between two
  // layers, give it a (non-empty) name.
  optional string name = 1;

  // Whether to require shared weights to have the same shape, or just the same
  // count -- defaults to STRICT if unspecified.
  optional DimCheckMode share_mode = 2;
  enum DimCheckMode {
    // STRICT (default) requires that num, channels, height, width each match.
    STRICT = 0;
    // PERMISSIVE requires only the count (num*channels*height*width) to match.
PERMISSIVE = 1; } // The multiplier on the global learning rate for this parameter. optional float lr_mult = 3 [default = 1.0]; // The multiplier on the global weight decay for this parameter. optional float decay_mult = 4 [default = 1.0]; } // NOTE // Update the next available ID when you add a new LayerParameter field. // // LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type repeated string bottom = 3; // the name of each bottom blob repeated string top = 4; // the name of each top blob // The train / test phase for computation. optional Phase phase = 10; // The amount of weight to assign each top blob in the objective. // Each layer assigns a default value, usually of either 0 or 1, // to each top blob. repeated float loss_weight = 5; // Specifies training parameters (multipliers on global learning constants, // and the name and other settings used for weight sharing). repeated ParamSpec param = 6; // The blobs containing the numeric parameters of the layer. repeated BlobProto blobs = 7; // Specifies whether to backpropagate to each bottom. If unspecified, // Caffe will automatically infer whether each input needs backpropagation // to compute parameter gradients. If set to true for some inputs, // backpropagation to those inputs is forced; if set false for some inputs, // backpropagation to those inputs is skipped. // // The size must be either 0 or equal to the number of bottoms. repeated bool propagate_down = 11; // Rules controlling whether and when a layer is included in the network, // based on the current NetState. You may specify a non-zero number of rules // to include OR exclude, but not both. If no include or exclude rules are // specified, the layer is always included. If the current NetState meets // ANY (i.e., one or more) of the specified rules, the layer is // included/excluded. repeated NetStateRule include = 8; repeated NetStateRule exclude = 9; // Parameters for data pre-processing. optional TransformationParameter transform_param = 100; // Parameters shared by loss layers. optional LossParameter loss_param = 101; // Layer type-specific parameters. // // Note: certain layers may have more than one computational engine // for their implementation. These layers include an Engine type and // engine parameter for selecting the implementation. // The default for the engine is set by the ENGINE switch at compile-time. 
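// (Illustrative text-format layer using one of the type-specific messages
// listed below; names and values are hypothetical:
//   layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1"
//           convolution_param { num_output: 96 kernel_size: 11 stride: 4 } } )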
  optional AccuracyParameter accuracy_param = 102;
  optional ArgMaxParameter argmax_param = 103;
  optional BatchNormParameter batch_norm_param = 139;
  optional BiasParameter bias_param = 141;
  optional ConcatParameter concat_param = 104;
  optional ContrastiveLossParameter contrastive_loss_param = 105;
  optional ConvolutionParameter convolution_param = 106;
  optional CropParameter crop_param = 144;
  optional DataParameter data_param = 107;
  optional DropoutParameter dropout_param = 108;
  optional DummyDataParameter dummy_data_param = 109;
  optional EltwiseParameter eltwise_param = 110;
  optional ELUParameter elu_param = 140;
  optional EmbedParameter embed_param = 137;
  optional ExpParameter exp_param = 111;
  optional FlattenParameter flatten_param = 135;
  optional HDF5DataParameter hdf5_data_param = 112;
  optional HDF5OutputParameter hdf5_output_param = 113;
  optional HingeLossParameter hinge_loss_param = 114;
  optional ImageDataParameter image_data_param = 115;
  optional InfogainLossParameter infogain_loss_param = 116;
  optional InnerProductParameter inner_product_param = 117;
  optional InputParameter input_param = 143;
  optional LogParameter log_param = 134;
  optional LRNParameter lrn_param = 118;
  optional MemoryDataParameter memory_data_param = 119;
  optional MVNParameter mvn_param = 120;
  optional ParameterParameter parameter_param = 145;
  optional PoolingParameter pooling_param = 121;
  optional PowerParameter power_param = 122;
  optional PReLUParameter prelu_param = 131;
  optional PythonParameter python_param = 130;
  optional RecurrentParameter recurrent_param = 146;
  optional ReductionParameter reduction_param = 136;
  optional ReLUParameter relu_param = 123;
  optional ReshapeParameter reshape_param = 133;
  optional ScaleParameter scale_param = 142;
  optional SigmoidParameter sigmoid_param = 124;
  optional SoftmaxParameter softmax_param = 125;
  optional SPPParameter spp_param = 132;
  optional SliceParameter slice_param = 126;
  optional TanHParameter tanh_param = 127;
  optional ThresholdParameter threshold_param = 128;
  optional TileParameter tile_param = 138;
  optional WindowDataParameter window_data_param = 129;
}

// Message that stores parameters used to apply transformation
// to the data layer's data
message TransformationParameter {
  // For data pre-processing, we can do simple scaling and subtracting the
  // data mean, if provided. Note that the mean subtraction is always carried
  // out before scaling.
  optional float scale = 1 [default = 1];
  // Specify if we want to randomly mirror data.
  optional bool mirror = 2 [default = false];
  // Specify if we would like to randomly crop an image.
  optional uint32 crop_size = 3 [default = 0];
  // mean_file and mean_value cannot be specified at the same time
  optional string mean_file = 4;
  // if specified can be repeated once (would subtract it from all the channels)
  // or can be repeated the same number of times as channels
  // (would subtract them from the corresponding channel)
  repeated float mean_value = 5;
  // Force the decoded image to have 3 color channels.
  optional bool force_color = 6 [default = false];
  // Force the decoded image to have 1 color channel.
  optional bool force_gray = 7 [default = false];
}

// Message that stores parameters shared by loss layers
message LossParameter {
  // If specified, ignore instances with the given label.
  optional int32 ignore_label = 1;
  // How to normalize the loss for loss layers that aggregate across batches,
  // spatial dimensions, or other dimensions. Currently only implemented in
  // SoftmaxWithLoss layer.
enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. // Outputs that receive the ignore label will NOT be ignored in computing // the normalization factor. FULL = 0; // Divide by the total number of output locations that do not take the // ignore_label. If ignore_label is not set, this behaves like FULL. VALID = 1; // Divide by the batch size. BATCH_SIZE = 2; // Do not normalize the loss. NONE = 3; } optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. If normalization // is not specified, then setting this to false will be equivalent to // normalization = BATCH_SIZE to be consistent with previous behavior. optional bool normalize = 2; } // Messages that store parameters used by individual layer types follow, in // alphabetical order. message AccuracyParameter { // When computing accuracy, count as correct by comparing the true label to // the top k scoring classes. By default, only compare to the top scoring // class (i.e. argmax). optional uint32 top_k = 1 [default = 1]; // The "label" axis of the prediction blob, whose argmax corresponds to the // predicted label -- may be negative to index from the end (e.g., -1 for the // last axis). For example, if axis == 1 and the predictions are // (N x C x H x W), the label blob is expected to contain N*H*W ground truth // labels with integer values in {0, 1, ..., C-1}. optional int32 axis = 2 [default = 1]; // If specified, ignore instances with the given label. optional int32 ignore_label = 3; } message ArgMaxParameter { // If true produce pairs (argmax, maxval) optional bool out_max_val = 1 [default = false]; optional uint32 top_k = 2 [default = 1]; // The axis along which to maximise -- may be negative to index from the // end (e.g., -1 for the last axis). // By default ArgMaxLayer maximizes over the flattened trailing dimensions // for each index of the first / num dimension. optional int32 axis = 3; } message ConcatParameter { // The axis along which to concatenate -- may be negative to index from the // end (e.g., -1 for the last axis). Other axes must have the // same dimension for all the bottom blobs. // By default, ConcatLayer concatenates blobs along the "channels" axis (1). optional int32 axis = 2 [default = 1]; // DEPRECATED: alias for "axis" -- does not support negative indexing. optional uint32 concat_dim = 1 [default = 1]; } message BatchNormParameter { // If false, accumulate global mean/variance values via a moving average. If // true, use those accumulated values instead of computing mean/variance // across the batch. optional bool use_global_stats = 1; // How much does the moving average decay each iteration? optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. optional float eps = 3 [default = 1e-5]; } message BiasParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end // (e.g., -1 for the last axis). 
// // For example, if bottom[0] is 4D with shape 100x3x40x60, the output // top[0] will have the same shape, and bottom[1] may have any of the // following shapes (for the given value of axis): // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 // (axis == 1 == -3) 3; 3x40; 3x40x60 // (axis == 2 == -2) 40; 40x60 // (axis == 3 == -1) 60 // Furthermore, bottom[1] may have the empty shape (regardless of the value of // "axis") -- a scalar bias. optional int32 axis = 1 [default = 1]; // (num_axes is ignored unless just one bottom is given and the bias is // a learned parameter of the layer. Otherwise, num_axes is determined by the // number of axes by the second bottom.) // The number of axes of the input (bottom[0]) covered by the bias // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. // Set num_axes := 0, to add a zero-axis Blob: a scalar. optional int32 num_axes = 2 [default = 1]; // (filler is ignored unless just one bottom is given and the bias is // a learned parameter of the layer.) // The initialization for the learned bias parameter. // Default is the zero (0) initialization, resulting in the BiasLayer // initially performing the identity operation. optional FillerParameter filler = 3; } message ContrastiveLossParameter { // margin for dissimilar pair optional float margin = 1 [default = 1.0]; // The first implementation of this cost did not exactly match the cost of // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. // legacy_version = false (the default) uses (margin - d)^2 as proposed in the // Hadsell paper. New models should probably use this version. // legacy_version = true uses (margin - d^2). This is kept to support / // reproduce existing models and results optional bool legacy_version = 2 [default = false]; } message ConvolutionParameter { optional uint32 num_output = 1; // The number of outputs for the layer optional bool bias_term = 2 [default = true]; // whether to have bias terms // Pad, kernel size, and stride are all given as a single value for equal // dimensions in all spatial dimensions, or once per spatial dimension. repeated uint32 pad = 3; // The padding size; defaults to 0 repeated uint32 kernel_size = 4; // The kernel size repeated uint32 stride = 6; // The stride; defaults to 1 // Factor used to dilate the kernel, (implicitly) zero-filling the resulting // holes. (Kernel dilation is sometimes referred to by its use in the // algorithme à trous from Holschneider et al. 1987.) repeated uint32 dilation = 18; // The dilation; defaults to 1 // For 2D convolution only, the *_h and *_w versions may also be used to // specify both spatial dimensions. optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) optional uint32 kernel_h = 11; // The kernel height (2D only) optional uint32 kernel_w = 12; // The kernel width (2D only) optional uint32 stride_h = 13; // The stride height (2D only) optional uint32 stride_w = 14; // The stride width (2D only) optional uint32 group = 5 [default = 1]; // The group size for group conv optional FillerParameter weight_filler = 7; // The filler for the weight optional FillerParameter bias_filler = 8; // The filler for the bias enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 15 [default = DEFAULT]; // The axis to interpret as "channels" when performing convolution. // Preceding dimensions are treated as independent inputs; // succeeding dimensions are treated as "spatial". 
// With (N, C, H, W) inputs, and axis == 1 (the default), we perform // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for // groups g>1) filters across the spatial axes (H, W) of the input. // With (N, C, D, H, W) inputs, and axis == 1, we perform // N independent 3D convolutions, sliding (C/g)-channels // filters across the spatial axes (D, H, W) of the input. optional int32 axis = 16 [default = 1]; // Whether to force use of the general ND convolution, even if a specific // implementation for blobs of the appropriate number of spatial dimensions // is available. (Currently, there is only a 2D-specific convolution // implementation; for input blobs with num_axes != 2, this option is // ignored and the ND implementation will be used.) optional bool force_nd_im2col = 17 [default = false]; } message CropParameter { // To crop, elements of the first bottom are selected to fit the dimensions // of the second, reference bottom. The crop is configured by // - the crop `axis` to pick the dimensions for cropping // - the crop `offset` to set the shift for all/each dimension // to align the cropped bottom with the reference bottom. // All dimensions up to but excluding `axis` are preserved, while // the dimensions including and trailing `axis` are cropped. // If only one `offset` is set, then all dimensions are offset by this amount. // Otherwise, the number of offsets must equal the number of cropped axes to // shift the crop in each dimension accordingly. // Note: standard dimensions are N,C,H,W so the default is a spatial crop, // and `axis` may be negative to index from the end (e.g., -1 for the last // axis). optional int32 axis = 1 [default = 2]; repeated uint32 offset = 2; } message DataParameter { enum DB { LEVELDB = 0; LMDB = 1; } // Specify the data source. optional string source = 1; // Specify the batch size. optional uint32 batch_size = 4; // The rand_skip variable is for the data layer to skip a few data points // to avoid all asynchronous sgd clients to start at the same point. The skip // point would be set as rand_skip * rand(0,1). Note that rand_skip should not // be larger than the number of keys in the database. // DEPRECATED. Each solver accesses a different subset of the database. optional uint32 rand_skip = 7 [default = 0]; optional DB backend = 8 [default = LEVELDB]; // DEPRECATED. See TransformationParameter. For data pre-processing, we can do // simple scaling and subtracting the data mean, if provided. Note that the // mean subtraction is always carried out before scaling. optional float scale = 2 [default = 1]; optional string mean_file = 3; // DEPRECATED. See TransformationParameter. Specify if we would like to randomly // crop an image. optional uint32 crop_size = 5 [default = 0]; // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror // data. optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; // Prefetch queue (Number of batches to prefetch to host memory, increase if // data access bandwidth varies). optional uint32 prefetch = 10 [default = 4]; } message DropoutParameter { optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio } // DummyDataLayer fills any number of arbitrarily shaped blobs with random // (or constant) data generated by "Fillers" (see "message FillerParameter"). message DummyDataParameter { // This layer produces N >= 1 top blobs. 
DummyDataParameter must specify 1 or N // shape fields, and 0, 1 or N data_fillers. // // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. // If 1 data_filler is specified, it is applied to all top blobs. If N are // specified, the ith is applied to the ith top blob. repeated FillerParameter data_filler = 1; repeated BlobShape shape = 6; // 4D dimensions -- deprecated. Use "shape" instead. repeated uint32 num = 2; repeated uint32 channels = 3; repeated uint32 height = 4; repeated uint32 width = 5; } message EltwiseParameter { enum EltwiseOp { PROD = 0; SUM = 1; MAX = 2; } optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation repeated float coeff = 2; // blob-wise coefficient for SUM operation // Whether to use an asymptotically slower (for >2 inputs) but stabler method // of computing the gradient for the PROD operation. (No effect for SUM op.) optional bool stable_prod_grad = 3 [default = true]; } // Message that stores parameters used by ELULayer message ELUParameter { // Described in: // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate // Deep Network Learning by Exponential Linear Units (ELUs). arXiv optional float alpha = 1 [default = 1]; } // Message that stores parameters used by EmbedLayer message EmbedParameter { optional uint32 num_output = 1; // The number of outputs for the layer // The input is given as integers to be interpreted as one-hot // vector indices with dimension num_input. Hence num_input should be // 1 greater than the maximum possible input value. optional uint32 input_dim = 2; optional bool bias_term = 3 [default = true]; // Whether to use a bias term optional FillerParameter weight_filler = 4; // The filler for the weight optional FillerParameter bias_filler = 5; // The filler for the bias } // Message that stores parameters used by ExpLayer message ExpParameter { // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. // Or if base is set to the default (-1), base is set to e, // so y = exp(shift + scale * x). optional float base = 1 [default = -1.0]; optional float scale = 2 [default = 1.0]; optional float shift = 3 [default = 0.0]; } /// Message that stores parameters used by FlattenLayer message FlattenParameter { // The first axis to flatten: all preceding axes are retained in the output. // May be negative to index from the end (e.g., -1 for the last axis). optional int32 axis = 1 [default = 1]; // The last axis to flatten: all following axes are retained in the output. // May be negative to index from the end (e.g., the default -1 for the last // axis). optional int32 end_axis = 2 [default = -1]; } // Message that stores parameters used by HDF5DataLayer message HDF5DataParameter { // Specify the data source. optional string source = 1; // Specify the batch size. optional uint32 batch_size = 2; // Specify whether to shuffle the data. // If shuffle == true, the ordering of the HDF5 files is shuffled, // and the ordering of data within any given HDF5 file is shuffled, // but data between different files are not interleaved; all of a file's // data are output (in a random order) before moving onto another file. optional bool shuffle = 3 [default = false]; } message HDF5OutputParameter { optional string file_name = 1; } message HingeLossParameter { enum Norm { L1 = 1; L2 = 2; } // Specify the Norm to use L1 or L2 optional Norm norm = 1 [default = L1]; } message ImageDataParameter { // Specify the data source. optional string source = 1; // Specify the batch size. 
optional uint32 batch_size = 4 [default = 1]; // The rand_skip variable is for the data layer to skip a few data points // to avoid all asynchronous sgd clients to start at the same point. The skip // point would be set as rand_skip * rand(0,1). Note that rand_skip should not // be larger than the number of keys in the database. optional uint32 rand_skip = 7 [default = 0]; // Whether or not ImageLayer should shuffle the list of files at every epoch. optional bool shuffle = 8 [default = false]; // It will also resize images if new_height or new_width are not zero. optional uint32 new_height = 9 [default = 0]; optional uint32 new_width = 10 [default = 0]; // Specify if the images are color or gray optional bool is_color = 11 [default = true]; // DEPRECATED. See TransformationParameter. For data pre-processing, we can do // simple scaling and subtracting the data mean, if provided. Note that the // mean subtraction is always carried out before scaling. optional float scale = 2 [default = 1]; optional string mean_file = 3; // DEPRECATED. See TransformationParameter. Specify if we would like to randomly // crop an image. optional uint32 crop_size = 5 [default = 0]; // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror // data. optional bool mirror = 6 [default = false]; optional string root_folder = 12 [default = ""]; } message InfogainLossParameter { // Specify the infogain matrix source. optional string source = 1; } message InnerProductParameter { optional uint32 num_output = 1; // The number of outputs for the layer optional bool bias_term = 2 [default = true]; // whether to have bias terms optional FillerParameter weight_filler = 3; // The filler for the weight optional FillerParameter bias_filler = 4; // The filler for the bias // The first axis to be lumped into a single inner product computation; // all preceding axes are retained in the output. // May be negative to index from the end (e.g., -1 for the last axis). optional int32 axis = 5 [default = 1]; // Specify whether to transpose the weight matrix or not. // If transpose == true, any operations will be performed on the transpose // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. optional bool transpose = 6 [default = false]; } message InputParameter { // This layer produces N >= 1 top blob(s) to be assigned manually. // Define N shapes to set a shape for each top. // Define 1 shape to set the same shape for every top. // Define no shape to defer to reshaping manually. repeated BlobShape shape = 1; } // Message that stores parameters used by LogLayer message LogParameter { // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. 
  // Or if base is set to the default (-1), base is set to e,
  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
  optional float base = 1 [default = -1.0];
  optional float scale = 2 [default = 1.0];
  optional float shift = 3 [default = 0.0];
}

// Message that stores parameters used by LRNLayer
message LRNParameter {
  optional uint32 local_size = 1 [default = 5];
  optional float alpha = 2 [default = 1.];
  optional float beta = 3 [default = 0.75];
  enum NormRegion {
    ACROSS_CHANNELS = 0;
    WITHIN_CHANNEL = 1;
  }
  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
  optional float k = 5 [default = 1.];
  enum Engine {
    DEFAULT = 0;
    CAFFE = 1;
    CUDNN = 2;
  }
  optional Engine engine = 6 [default = DEFAULT];
}

message MemoryDataParameter {
  optional uint32 batch_size = 1;
  optional uint32 channels = 2;
  optional uint32 height = 3;
  optional uint32 width = 4;
}

message MVNParameter {
  // This parameter can be set to false to normalize mean only
  optional bool normalize_variance = 1 [default = true];
  // This parameter can be set to true to perform DNN-like MVN
  optional bool across_channels = 2 [default = false];
  // Epsilon for not dividing by zero while normalizing variance
  optional float eps = 3 [default = 1e-9];
}

message ParameterParameter {
  optional BlobShape shape = 1;
}

message PoolingParameter {
  enum PoolMethod {
    MAX = 0;
    AVE = 1;
    STOCHASTIC = 2;
  }
  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
  // Pad, kernel size, and stride are all given as a single value for equal
  // dimensions in height and width or as Y, X pairs.
  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
  optional uint32 pad_h = 9 [default = 0]; // The padding height
  optional uint32 pad_w = 10 [default = 0]; // The padding width
  optional uint32 kernel_size = 2; // The kernel size (square)
  optional uint32 kernel_h = 5; // The kernel height
  optional uint32 kernel_w = 6; // The kernel width
  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
  optional uint32 stride_h = 7; // The stride height
  optional uint32 stride_w = 8; // The stride width
  enum Engine {
    DEFAULT = 0;
    CAFFE = 1;
    CUDNN = 2;
  }
  optional Engine engine = 11 [default = DEFAULT];
  // If global_pooling then it will pool over the size of the bottom by doing
  // kernel_h = bottom->height and kernel_w = bottom->width
  optional bool global_pooling = 12 [default = false];
}

message PowerParameter {
  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
  optional float power = 1 [default = 1.0];
  optional float scale = 2 [default = 1.0];
  optional float shift = 3 [default = 0.0];
}

message PythonParameter {
  optional string module = 1;
  optional string layer = 2;
  // This value is set to the attribute `param_str` of the `PythonLayer` object
  // in Python before calling the `setup()` method. This could be a number,
  // string, dictionary in Python dict format, JSON, etc. You may parse this
  // string in `setup` method and use it in `forward` and `backward`.
  optional string param_str = 3 [default = ''];
  // Whether this PythonLayer is shared among worker solvers during data parallelism.
  // If true, each worker solver sequentially runs forward from this layer.
  // This value should be set true if you are using it as a data layer.
  optional bool share_in_parallel = 4 [default = false];
}

// Message that stores parameters used by RecurrentLayer
message RecurrentParameter {
  // The dimension of the output (and usually hidden state) representation --
  // must be explicitly set to non-zero.
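  // (Illustrative: recurrent_param { num_output: 256 } requests a
  // 256-dimensional hidden state; the value is hypothetical.)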
optional uint32 num_output = 1 [default = 0]; optional FillerParameter weight_filler = 2; // The filler for the weight optional FillerParameter bias_filler = 3; // The filler for the bias // Whether to enable displaying debug_info in the unrolled recurrent net. optional bool debug_info = 4 [default = false]; // Whether to add as additional inputs (bottoms) the initial hidden state // blobs, and add as additional outputs (tops) the final timestep hidden state // blobs. The number of additional bottom/top blobs required depends on the // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. optional bool expose_hidden = 5 [default = false]; } // Message that stores parameters used by ReductionLayer message ReductionParameter { enum ReductionOp { SUM = 1; ASUM = 2; SUMSQ = 3; MEAN = 4; } optional ReductionOp operation = 1 [default = SUM]; // reduction operation // The first axis to reduce to a scalar -- may be negative to index from the // end (e.g., -1 for the last axis). // (Currently, only reduction along ALL "tail" axes is supported; reduction // of axis M through N, where N < num_axes - 1, is unsupported.) // Suppose we have an n-axis bottom Blob with shape: // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)). // If axis == m, the output Blob will have shape // (d0, d1, d2, ..., d(m-1)), // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1)) // times, each including (dm * d(m+1) * ... * d(n-1)) individual data. // If axis == 0 (the default), the output Blob always has the empty shape // (count 1), performing reduction across the entire input -- // often useful for creating new loss functions. optional int32 axis = 2 [default = 0]; optional float coeff = 3 [default = 1.0]; // coefficient for output } // Message that stores parameters used by ReLULayer message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization // Described in: // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities // improve neural network acoustic models. In ICML Workshop on Deep Learning // for Audio, Speech, and Language Processing. optional float negative_slope = 1 [default = 0]; enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 2 [default = DEFAULT]; } message ReshapeParameter { // Specify the output dimensions. If some of the dimensions are set to 0, // the corresponding dimension from the bottom layer is used (unchanged). // Exactly one dimension may be set to -1, in which case its value is // inferred from the count of the bottom blob and the remaining dimensions. // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: // // layer { // type: "Reshape" bottom: "input" top: "output" // reshape_param { ... } // } // // If "input" is 2D with shape 2 x 8, then the following reshape_param // specifications are all equivalent, producing a 3D blob "output" with shape // 2 x 2 x 4: // // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } // optional BlobShape shape = 1; // axis and num_axes control the portion of the bottom blob's shape that are // replaced by (included in) the reshape. By default (axis == 0 and // num_axes == -1), the entire bottom blob shape is included in the reshape, // and hence the shape field must specify the entire output shape. 
// // axis may be non-zero to retain some portion of the beginning of the input // shape (and may be negative to index from the end; e.g., -1 to begin the // reshape after the last axis, including nothing in the reshape, // -2 to include only the last axis, etc.). // // For example, suppose "input" is a 2D blob with shape 2 x 8. // Then the following ReshapeLayer specifications are all equivalent, // producing a blob "output" with shape 2 x 2 x 4: // // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } // // num_axes specifies the extent of the reshape. // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on // input axes in the range [axis, axis+num_axes]. // num_axes may also be -1, the default, to include all remaining axes // (starting from axis). // // For example, suppose "input" is a 2D blob with shape 2 x 8. // Then the following ReshapeLayer specifications are equivalent, // producing a blob "output" with shape 1 x 2 x 8. // // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } // reshape_param { shape { dim: 1 } num_axes: 0 } // // On the other hand, these would produce output blob shape 2 x 1 x 8: // // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } // optional int32 axis = 2 [default = 0]; optional int32 num_axes = 3 [default = -1]; } message ScaleParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end // (e.g., -1 for the last axis). // // For example, if bottom[0] is 4D with shape 100x3x40x60, the output // top[0] will have the same shape, and bottom[1] may have any of the // following shapes (for the given value of axis): // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 // (axis == 1 == -3) 3; 3x40; 3x40x60 // (axis == 2 == -2) 40; 40x60 // (axis == 3 == -1) 60 // Furthermore, bottom[1] may have the empty shape (regardless of the value of // "axis") -- a scalar multiplier. optional int32 axis = 1 [default = 1]; // (num_axes is ignored unless just one bottom is given and the scale is // a learned parameter of the layer. Otherwise, num_axes is determined by the // number of axes by the second bottom.) // The number of axes of the input (bottom[0]) covered by the scale // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. optional int32 num_axes = 2 [default = 1]; // (filler is ignored unless just one bottom is given and the scale is // a learned parameter of the layer.) // The initialization for the learned scale parameter. // Default is the unit (1) initialization, resulting in the ScaleLayer // initially performing the identity operation. optional FillerParameter filler = 3; // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but // may be more efficient). Initialized with bias_filler (defaults to 0). optional bool bias_term = 4 [default = false]; optional FillerParameter bias_filler = 5; } message SigmoidParameter { enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 1 [default = DEFAULT]; } message SliceParameter { // The axis along which to slice -- may be negative to index from the end // (e.g., -1 for the last axis). // By default, SliceLayer concatenates blobs along the "channels" axis (1). 
optional int32 axis = 3 [default = 1]; repeated uint32 slice_point = 2; // DEPRECATED: alias for "axis" -- does not support negative indexing. optional uint32 slice_dim = 1 [default = 1]; } // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer message SoftmaxParameter { enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 1 [default = DEFAULT]; // The axis along which to perform the softmax -- may be negative to index // from the end (e.g., -1 for the last axis). // Any other axes will be evaluated as independent softmaxes. optional int32 axis = 2 [default = 1]; } message TanHParameter { enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 1 [default = DEFAULT]; } // Message that stores parameters used by TileLayer message TileParameter { // The index of the axis to tile. optional int32 axis = 1 [default = 1]; // The number of copies (tiles) of the blob to output. optional int32 tiles = 2; } // Message that stores parameters used by ThresholdLayer message ThresholdParameter { optional float threshold = 1 [default = 0]; // Strictly positive values } message WindowDataParameter { // Specify the data source. optional string source = 1; // For data pre-processing, we can do simple scaling and subtracting the // data mean, if provided. Note that the mean subtraction is always carried // out before scaling. optional float scale = 2 [default = 1]; optional string mean_file = 3; // Specify the batch size. optional uint32 batch_size = 4; // Specify if we would like to randomly crop an image. optional uint32 crop_size = 5 [default = 0]; // Specify if we want to randomly mirror data. optional bool mirror = 6 [default = false]; // Foreground (object) overlap threshold optional float fg_threshold = 7 [default = 0.5]; // Background (non-object) overlap threshold optional float bg_threshold = 8 [default = 0.5]; // Fraction of batch that should be foreground objects optional float fg_fraction = 9 [default = 0.25]; // Amount of contextual padding to add around a window // (used only by the window_data_layer) optional uint32 context_pad = 10 [default = 0]; // Mode for cropping out a detection window // warp: cropped window is warped to a fixed size and aspect ratio // square: the tightest square around the window is cropped optional string crop_mode = 11 [default = "warp"]; // cache_images: will load all images in memory for faster access optional bool cache_images = 12 [default = false]; // append root_folder to locate images optional string root_folder = 13 [default = ""]; } message SPPParameter { enum PoolMethod { MAX = 0; AVE = 1; STOCHASTIC = 2; } optional uint32 pyramid_height = 1; optional PoolMethod pool = 2 [default = MAX]; // The pooling method enum Engine { DEFAULT = 0; CAFFE = 1; CUDNN = 2; } optional Engine engine = 6 [default = DEFAULT]; } // DEPRECATED: use LayerParameter. 
message V1LayerParameter { repeated string bottom = 2; repeated string top = 3; optional string name = 4; repeated NetStateRule include = 32; repeated NetStateRule exclude = 33; enum LayerType { NONE = 0; ABSVAL = 35; ACCURACY = 1; ARGMAX = 30; BNLL = 2; CONCAT = 3; CONTRASTIVE_LOSS = 37; CONVOLUTION = 4; DATA = 5; DECONVOLUTION = 39; DROPOUT = 6; DUMMY_DATA = 32; EUCLIDEAN_LOSS = 7; ELTWISE = 25; EXP = 38; FLATTEN = 8; HDF5_DATA = 9; HDF5_OUTPUT = 10; HINGE_LOSS = 28; IM2COL = 11; IMAGE_DATA = 12; INFOGAIN_LOSS = 13; INNER_PRODUCT = 14; LRN = 15; MEMORY_DATA = 29; MULTINOMIAL_LOGISTIC_LOSS = 16; MVN = 34; POOLING = 17; POWER = 26; RELU = 18; SIGMOID = 19; SIGMOID_CROSS_ENTROPY_LOSS = 27; SILENCE = 36; SOFTMAX = 20; SOFTMAX_LOSS = 21; SPLIT = 22; SLICE = 33; TANH = 23; WINDOW_DATA = 24; THRESHOLD = 31; } optional LayerType type = 5; repeated BlobProto blobs = 6; repeated string param = 1001; repeated DimCheckMode blob_share_mode = 1002; enum DimCheckMode { STRICT = 0; PERMISSIVE = 1; } repeated float blobs_lr = 7; repeated float weight_decay = 8; repeated float loss_weight = 35; optional AccuracyParameter accuracy_param = 27; optional ArgMaxParameter argmax_param = 23; optional ConcatParameter concat_param = 9; optional ContrastiveLossParameter contrastive_loss_param = 40; optional ConvolutionParameter convolution_param = 10; optional DataParameter data_param = 11; optional DropoutParameter dropout_param = 12; optional DummyDataParameter dummy_data_param = 26; optional EltwiseParameter eltwise_param = 24; optional ExpParameter exp_param = 41; optional HDF5DataParameter hdf5_data_param = 13; optional HDF5OutputParameter hdf5_output_param = 14; optional HingeLossParameter hinge_loss_param = 29; optional ImageDataParameter image_data_param = 15; optional InfogainLossParameter infogain_loss_param = 16; optional InnerProductParameter inner_product_param = 17; optional LRNParameter lrn_param = 18; optional MemoryDataParameter memory_data_param = 22; optional MVNParameter mvn_param = 34; optional PoolingParameter pooling_param = 19; optional PowerParameter power_param = 21; optional ReLUParameter relu_param = 30; optional SigmoidParameter sigmoid_param = 38; optional SoftmaxParameter softmax_param = 39; optional SliceParameter slice_param = 31; optional TanHParameter tanh_param = 37; optional ThresholdParameter threshold_param = 25; optional WindowDataParameter window_data_param = 20; optional TransformationParameter transform_param = 36; optional LossParameter loss_param = 42; optional V0LayerParameter layer = 1; } // DEPRECATED: V0LayerParameter is the old way of specifying layer parameters // in Caffe. We keep this message type around for legacy support. message V0LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the string to specify the layer type // Parameters to specify layers with inner products. 
optional uint32 num_output = 3; // The number of outputs for the layer optional bool biasterm = 4 [default = true]; // whether to have bias terms optional FillerParameter weight_filler = 5; // The filler for the weight optional FillerParameter bias_filler = 6; // The filler for the bias optional uint32 pad = 7 [default = 0]; // The padding size optional uint32 kernelsize = 8; // The kernel size optional uint32 group = 9 [default = 1]; // The group size for group conv optional uint32 stride = 10 [default = 1]; // The stride enum PoolMethod { MAX = 0; AVE = 1; STOCHASTIC = 2; } optional PoolMethod pool = 11 [default = MAX]; // The pooling method optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio optional uint32 local_size = 13 [default = 5]; // for local response norm optional float alpha = 14 [default = 1.]; // for local response norm optional float beta = 15 [default = 0.75]; // for local response norm optional float k = 22 [default = 1.]; // For data layers, specify the data source optional string source = 16; // For data pre-processing, we can do simple scaling and subtracting the // data mean, if provided. Note that the mean subtraction is always carried // out before scaling. optional float scale = 17 [default = 1]; optional string meanfile = 18; // For data layers, specify the batch size. optional uint32 batchsize = 19; // For data layers, specify if we would like to randomly crop an image. optional uint32 cropsize = 20 [default = 0]; // For data layers, specify if we want to randomly mirror data. optional bool mirror = 21 [default = false]; // The blobs containing the numeric parameters of the layer repeated BlobProto blobs = 50; // The ratio that is multiplied on the global learning rate. If you want to // set the learning ratio for one blob, you need to set it for all blobs. repeated float blobs_lr = 51; // The weight decay that is multiplied on the global weight decay. repeated float weight_decay = 52; // The rand_skip variable is for the data layer to skip a few data points // to avoid all asynchronous sgd clients to start at the same point. The skip // point would be set as rand_skip * rand(0,1). Note that rand_skip should not // be larger than the number of keys in the database. optional uint32 rand_skip = 53 [default = 0]; // Fields related to detection (det_*) // foreground (object) overlap threshold optional float det_fg_threshold = 54 [default = 0.5]; // background (non-object) overlap threshold optional float det_bg_threshold = 55 [default = 0.5]; // Fraction of batch that should be foreground objects optional float det_fg_fraction = 56 [default = 0.25]; // optional bool OBSOLETE_can_clobber = 57 [default = true]; // Amount of contextual padding to add around a window // (used only by the window_data_layer) optional uint32 det_context_pad = 58 [default = 0]; // Mode for cropping out a detection window // warp: cropped window is warped to a fixed size and aspect ratio // square: the tightest square around the window is cropped optional string det_crop_mode = 59 [default = "warp"]; // For ReshapeLayer, one needs to specify the new dimensions. optional int32 new_num = 60 [default = 0]; optional int32 new_channels = 61 [default = 0]; optional int32 new_height = 62 [default = 0]; optional int32 new_width = 63 [default = 0]; // Whether or not ImageLayer should shuffle the list of files at every epoch. // It will also resize images if new_height or new_width are not zero. 
  optional bool shuffle_images = 64 [default = false];
  // For ConcatLayer, one needs to specify the dimension for concatenation, and
  // the other dimensions must be the same for all the bottom blobs.
  // By default it will concatenate blobs along the channels dimension.
  optional uint32 concat_dim = 65 [default = 1];

  optional HDF5OutputParameter hdf5_output_param = 1001;
}

message PReLUParameter {
  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
  // Surpassing Human-Level Performance on ImageNet Classification, 2015.

  // Initial value of a_i. Default is a_i=0.25 for all i.
  optional FillerParameter filler = 1;
  // Whether or not slope parameters are shared across channels.
  optional bool channel_shared = 2 [default = false];
}

================================================
FILE: caffe2/.clang-format
================================================
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
  - Regex: '^<.*\.h(pp)?>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

================================================
FILE: caffe2/CMakeLists.txt
================================================
# ---[ Declare source file lists

# ---[ Add respective subdirectories
# Note: the folders that are being commented out have not been properly
# addressed yet.
add_subdirectory(proto)
add_subdirectory(contrib)
add_subdirectory(core)
add_subdirectory(cuda_rtc)
add_subdirectory(db)
add_subdirectory(distributed)
# add_subdirectory(experiments) # note, we may remove this folder at some point
add_subdirectory(image)
add_subdirectory(video)
add_subdirectory(mkl)
add_subdirectory(mobile)
add_subdirectory(mpi)
add_subdirectory(observers)
add_subdirectory(onnx)
add_subdirectory(operators)
add_subdirectory(perfkernels)
add_subdirectory(python)
add_subdirectory(queue)
add_subdirectory(sgd)
add_subdirectory(share)
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
add_subdirectory(transforms)
add_subdirectory(utils)

# Advanced: if we have white list specified, we will do intersections for all
# main lib srcs.
if (CAFFE2_WHITELISTED_FILES)
  caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES)
  caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES)
endif()

# Debug messages - if you want to get a list of source files, enable the
# following.
if (FALSE)
  message(STATUS "CPU sources: ")
  foreach(tmp ${Caffe2_CPU_SRCS})
    message(STATUS " " ${tmp})
  endforeach()
  message(STATUS "GPU sources: ")
  foreach(tmp ${Caffe2_GPU_SRCS})
    message(STATUS " " ${tmp})
  endforeach()
  message(STATUS "CPU test sources: ")
  foreach(tmp ${Caffe2_CPU_TEST_SRCS})
    message(STATUS " " ${tmp})
  endforeach()
  message(STATUS "GPU test sources: ")
  foreach(tmp ${Caffe2_GPU_TEST_SRCS})
    message(STATUS " " ${tmp})
  endforeach()
endif()

# ---[ Generate and install header files.
# Write the macros file.
configure_file(
    ${PROJECT_SOURCE_DIR}/caffe2/core/macros.h.in
    ${PROJECT_BINARY_DIR}/caffe2/core/macros.h)

# Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
        DESTINATION include
        FILES_MATCHING PATTERN "*.h")
install(FILES ${PROJECT_BINARY_DIR}/caffe2/core/macros.h
        DESTINATION include/caffe2/core)

# ---[ List of libraries to link with
# Compile exposed libraries.
add_library(caffe2 ${Caffe2_CPU_SRCS} $<TARGET_OBJECTS:Caffe_PROTO> $<TARGET_OBJECTS:Caffe2_PROTO>)
add_dependencies(caffe2 Caffe_PROTO Caffe2_PROTO)
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
target_include_directories(caffe2 INTERFACE $<INSTALL_INTERFACE:include>)
target_compile_options(caffe2 INTERFACE "-std=c++11")
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib)
caffe2_interface_library(caffe2 caffe2_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_library)

# ---[ CUDA library.
if(USE_CUDA)
  # A hack to deal with cuda library dependencies and modern CMake: the
  # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
  # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
  # hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with
  # it. We will then manually add the cudart library as interface libs.
  set(__tmp ${CUDA_LIBRARIES})
  set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
  CUDA_ADD_LIBRARY(caffe2_gpu ${Caffe2_GPU_SRCS})
  set(CUDA_LIBRARIES ${__tmp})
  target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart)
  target_include_directories(
      caffe2_gpu INTERFACE $<INSTALL_INTERFACE:include>)
  target_link_libraries(
      caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
  target_link_libraries(
      caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
  caffe2_interface_library(caffe2_gpu caffe2_gpu_library)
  list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library)
  install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib)
endif()

# ---[ Test binaries.
if (BUILD_TEST)
  set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS})
  if (USE_CUDA)
    list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS})
  endif()
  foreach(test_src ${Caffe2_ALL_TEST_SRCS})
    get_filename_component(test_name ${test_src} NAME_WE)
    add_executable(${test_name} "${test_src}")
    # For tests, some of the test code actually directly calls the dependent
    # libraries even if they are not part of the public dependency libs. As a
    # result, we will explicitly link the test against the Caffe2 dependency
    # libs.
    target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main)
    if (USE_CUDA)
      target_link_libraries(${test_name} ${Caffe2_CUDA_DEPENDENCY_LIBS})
    endif()
    if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0)
      target_compile_features(${test_name} PRIVATE cxx_range_for)
    endif()
    add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
    install(TARGETS ${test_name} DESTINATION test)
  endforeach()
endif()

if (BUILD_PYTHON)
  # Python site-packages
  # Get canonical directory for python site packages (relative to install
  # location). It varies from system to system.
  pycmd(PYTHON_SITE_PACKAGES "
    from distutils import sysconfig
    print(sysconfig.get_python_lib(prefix=''))
  ")
  # ---[ Options.
  SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)")
  message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path")

  # Python extension suffix
  # Try to get it from python through sysconfig.get_config_var('EXT_SUFFIX') first,
  # fall back to ".pyd" on windows and ".so" for all others.
  pycmd(PY_EXT_SUFFIX "
    from distutils import sysconfig
    ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
    print(ext_suffix if ext_suffix else '')
  ")
  if("${PY_EXT_SUFFIX}" STREQUAL "")
    if (MSVC)
      set(PY_EXT_SUFFIX ".pyd")
    else()
      set(PY_EXT_SUFFIX ".so")
    endif()
  endif()

  # ---[ Python.
  add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
  set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
  set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "")
  set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
  if (APPLE)
    set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
  endif()
  set_target_properties(
      caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
      ${CMAKE_BINARY_DIR}/caffe2/python)
  target_link_libraries(
      caffe2_pybind11_state caffe2_library)
  install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")

  if(USE_CUDA)
    add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
    set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
    set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "")
    set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
    if (APPLE)
      set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif()
    set_target_properties(
        caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
        ${CMAKE_BINARY_DIR}/caffe2/python)
    target_link_libraries(
        caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library)
    install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
  endif()

  if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
    # If we are building under windows, we will copy the file from
    # build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
    # to its parent folder so that we can do in-build execution.
    add_custom_target(windows_python_copy_lib ALL)
    add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
    add_custom_command(
        TARGET windows_python_copy_lib POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:caffe2_pybind11_state>
        ${CMAKE_BINARY_DIR}/caffe2/python)
    if (USE_CUDA)
      add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
      add_custom_command(
          TARGET windows_python_copy_lib POST_BUILD
          COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:caffe2_pybind11_state_gpu>
          ${CMAKE_BINARY_DIR}/caffe2/python)
    endif()
  endif()

  # Finally, copy all python files to build directory
  # Generate and create all needed __init__.py files, if they aren't already
  # present in the current source tree.
  message(STATUS "Automatically generating missing __init__.py files.")
  caffe_autogen_init_py_files()

  # Create a custom target that copies all python files.
  file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
       "${PROJECT_SOURCE_DIR}/caffe2/*.py")
  add_custom_target(python_copy_files ALL)
  if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja")
    # ninja fails when the command line is too long so we split
    # the target into several. This would be beneficial for VS also
    # since it builds targets in parallel but not custom commands
    foreach(python_src ${PYTHON_SRCS})
      get_filename_component(dir ${python_src} DIRECTORY)
      string(SHA1 name_hash "${python_src}")
      # get_filename_component(name_we ${python_src} NAME_WE)
      add_custom_target(python_copy_files_${name_hash}
          COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/${python_src}
          ${CMAKE_BINARY_DIR}/${dir})
      add_dependencies(python_copy_files python_copy_files_${name_hash})
    endforeach()
  else()
    foreach(python_src ${PYTHON_SRCS})
      get_filename_component(dir ${python_src} DIRECTORY)
      add_custom_command(
          TARGET python_copy_files PRE_BUILD
          COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/${python_src}
          ${CMAKE_BINARY_DIR}/${dir})
    endforeach()
  endif()

  # Install commands
  # Pick up static python files
  install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
          FILES_MATCHING PATTERN "*.py")
  # Caffe proto files
  install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH}
          FILES_MATCHING PATTERN "*.py")
  # Caffe2 proto files
  install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
          FILES_MATCHING PATTERN "*.py")
endif()

# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope.
set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE)

================================================ FILE: caffe2/__init__.py ================================================

================================================ FILE: caffe2/contrib/CMakeLists.txt ================================================
add_subdirectory(aten)
add_subdirectory(gloo)
add_subdirectory(nccl)
add_subdirectory(prof)
add_subdirectory(shm_mutex)
add_subdirectory(script)

# Finally pass the src lists back to the parent
# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)

================================================ FILE: caffe2/contrib/__init__.py ================================================

================================================ FILE: caffe2/contrib/aten/CMakeLists.txt ================================================
if(USE_ATEN)
  if(NOT USE_CUDA)
    set(NO_CUDA ON)
  endif()
  set(TORCH_CUDA_ARCH_LIST "3.5 5.2 6.0 6.1+PTX")
  set(TORCH_NVCC_FLAGS "-Xfatbin -compress-all")
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
  set(AT_LINK_STYLE STATIC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
  add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/aten aten EXCLUDE_FROM_ALL)

  add_custom_command(OUTPUT aten_op.h
      COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
          --third_party_root=${PROJECT_SOURCE_DIR}/third_party
          --template_dir=${PROJECT_SOURCE_DIR}/caffe2/contrib/aten
      DEPENDS ATen
          ${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
          ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_template.h)
  add_custom_target(__aten_op_header_gen DEPENDS aten_op.h)
  add_library(aten_op_header_gen INTERFACE)
  add_dependencies(aten_op_header_gen __aten_op_header_gen)

  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS}
      "${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc" PARENT_SCOPE)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
      "${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc" PARENT_SCOPE)
endif()
================================================ FILE: caffe2/contrib/aten/README.md ================================================
# An ATen operator for Caffe2

[ATen](https://github.com/zdevito/aten) is a simple tensor library that exposes the Tensor operations in Torch and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API that makes these functions available in Caffe2 as an operator. It also makes them accessible using the ToffeeIR.

### Example Usage in Caffe2

First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), [Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). We will call the `pow` operator:

```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```

Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`, and there is always a string attribute `operator` that defines which ATen function to call:

```
import numpy as np
from caffe2.python import core, workspace

# create the Caffe2 Op:
op = core.CreateOperator(
    "ATen",
    ["MyInput"],
    ["MyOutput"],
    operator="pow", exponent=2.0)
```

Each `Tensor` input becomes a Caffe2 input Blob, and each output becomes a Caffe2 output blob. Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes. In the case of `Scalar` the attributes can be either integers or floating point numbers.

The op can now be run like any other Caffe2 operator:

```
workspace.FeedBlob("MyInput", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("MyOutput"))
```

For methods, the first input is always the `this` Tensor in C++. To call methods of ATen's `Type` objects, you provide an additional string attribute that determines the type:

```
# create a 2x4 tensor filled with floating point ones
op = core.CreateOperator(
    "ATen",
    [],
    ["MyOutput"],
    operator="ones", type="float", size={2, 4})
```

Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.

### Example Usage via PyTorch Symbolic

The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:

```
class Add(torch.autograd.Function):

    @staticmethod
    def symbolic(g, a, b):
        return g.op("ATen", a, b, operator_s="add")

    @staticmethod
    def forward(ctx, a, b):
        return a + b
```
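### Example: Multiple Outputs

ATen functions that return several tensors map onto several Caffe2 output blobs: the operator is simply created with one output name per returned tensor. For instance, `sort` returns both the sorted values and the sorting indices. A minimal sketch mirroring the `sort` case in `aten_test.py` (the blob names here are arbitrary):

```
import numpy as np
from caffe2.python import core, workspace

# "sort" returns (values, indices): one Caffe2 output blob for each
op = core.CreateOperator(
    "ATen", ["X"], ["Sorted", "Indices"],
    operator="sort")

workspace.FeedBlob("X", np.random.permutation(6).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Sorted"))   # values in ascending order
print(workspace.FetchBlob("Indices"))  # positions of those values in X
```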
================================================ FILE: caffe2/contrib/aten/aten_op.cc ================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
template <>
at::Backend ATenOp<CPUContext>::backend() const {
  return at::kCPU;
}

OPERATOR_SCHEMA(ATen);
CAFFE_KNOWN_TYPE(at::Half);

namespace math {
template <>
void Set<at::Half, CPUContext>(
    const size_t N,
    const at::Half h,
    at::Half* v,
    CPUContext* c) {
  Set(0, h.x, (uint16_t*)v, c);
}
}
}

================================================ FILE: caffe2/contrib/aten/aten_op.h ================================================
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"

================================================ FILE: caffe2/contrib/aten/aten_op_cuda.cc ================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
template <>
at::Backend ATenOp<CUDAContext>::backend() const {
  return at::kCUDA;
}

namespace math {
template <>
void Set<at::Half, CUDAContext>(
    const size_t N,
    const at::Half h,
    at::Half* v,
    CUDAContext* c) {
  Set(0, h.x, (uint16_t*)v, c);
}
}
}

================================================ FILE: caffe2/contrib/aten/aten_op_template.h ================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <unordered_map>
#include <string>
#include <sstream>
#include <vector>
#include <algorithm>
#include <ATen/ATen.h>
#include <caffe2/core/operator.h>
#include <caffe2/utils/math.h>

// a map from descriptor strings (see [DESCRIPTORS])
// to the key in the switch statement that implements them
static std::unordered_map<std::string, int> op_to_key = {
  ${mappings}
};

namespace caffe2 {

using at::Half; // for AT_FORALL_SCALAR_TYPES

template <class Context>
class ATenOp : public Operator<Context> {
 public:
  ATenOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {
    VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
    switch (findImplementation(operator_def)) {
      ${implementations}
      default:
        CAFFE_THROW("Unexpected key value for aten operator");
    }
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    return run_op();
  }

 private:
  // actual operator implementation is initialized in ctor.
  std::function<bool()> run_op;
  at::Backend backend() const;

  TypeMeta typeMetaFor(const at::Tensor& t) {
    return typeMetaFor(t.type().scalarType());
  }
  TypeMeta typeMetaFor(at::ScalarType st) {
#define DEFINE_CASE(ctype, aten_name, _) \
  case at::k##aten_name:                 \
    return TypeMeta::Make<ctype>();
    switch (st) {
      AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
      default:
        CAFFE_THROW("Unknown ATen Type");
    }
#undef DEFINE_CASE
  }

  at::Type& typeFor(const Tensor<Context>& ten) {
    return at::getType(backend(), atScalarTypeFor(ten.meta()));
  }

  at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
    auto& ten = const_cast<Tensor<Context>&>(ten_);
    return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
  }

  at::Tensor loadInput(size_t i) {
    return tensorWrapping(Input(i));
  }

  std::vector<at::Tensor> loadInputsAtOffset(size_t s) {
    std::vector<at::Tensor> results;
    for (size_t i = s; i < InputSize(); i++) {
      results.push_back(loadInput(i));
    }
    return results;
  }

  at::ScalarType atScalarTypeFor(const TypeMeta& meta) {
#define DEFINE_IF(ctype, aten_name, _) \
  if (meta.Match<ctype>()) {           \
    return at::k##aten_name;           \
  }
    AT_FORALL_SCALAR_TYPES(DEFINE_IF)
#undef DEFINE_IF
    CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
  }

  void assignTo(Tensor<Context>* dst, const at::Tensor& src_) {
    at::Tensor src = src_.contiguous();
    auto at_sizes = src.sizes();
    std::vector<TIndex> dims(at_sizes.begin(), at_sizes.end());
    dst->Resize(dims);
    dst->ShareExternalPointer(
        src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable {
          // return a closure that holds a handle to t until it is called
          // to keep the aten memory alive
          return src.reset();
        });
  }

  void assignListStartingAt(
      size_t offset,
      const std::vector<at::Tensor>& tensors) {
    for (size_t i = 0; i < tensors.size(); i++) {
      assignTo(Output(offset + i), tensors[i]);
    }
  }

  // the AT_FORALL_SCALAR_TYPES macro just gives an 'i' or 'd' argument
  // for each type to specify if it is stored as an integer or a double.
  // We need this workaround here to extract the value in the scalar losslessly
  // because in some cases like 'sum' Torch promotes float to double
  // and will complain if we downcast it with toFloat, causing it
  // to lose precision
  double extract_d(const at::Scalar& s) {
    return s.toDouble();
  }
  int64_t extract_i(const at::Scalar& s) {
    return s.toLong();
  }

  void assignTo(Tensor<Context>* dst, at::Type& inferred_type, at::Scalar scalar) {
    switch (inferred_type.scalarType()) {
#define DEFINE_CASE(ctype, aten_name, native)        \
  case at::k##aten_name: {                           \
    auto value = extract_##native(scalar);           \
    assignToValue(dst, at::convert<ctype>(value));   \
  } break;
      AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
#undef DEFINE_CASE
      default:
        CAFFE_THROW("Unknown ATen Type");
    }
  }

  template <typename T>
  void assignToValue(Tensor<Context>* dst, T v) {
    dst->Resize(std::vector<TIndex>());
    math::Set(1, v, dst->template mutable_data<T>(), &context_);
  }

  int findImplementation(const OperatorDef& operator_def) {
    CAFFE_ENFORCE(HasArgument("operator"));
    std::string op =
        OperatorBase::GetSingleArgument<std::string>("operator", "");
    // construct descriptor string ([DESCRIPTORS]) given the attributes
    // and inputs of this operator_def, and look up the implementation key
    // for this variant
    std::stringstream descriptor;
    descriptor << op;
    std::vector<std::string> attrs;
    for (size_t i = 0; i < operator_def.arg_size(); i++) {
      auto& attr = operator_def.arg(i);
      if (attr.name() == "operator" || attr.name() == "type")
        continue;
      attrs.push_back(attr.name());
    }
    std::sort(attrs.begin(), attrs.end());
    for (auto& a : attrs)
      descriptor << "-" << a;

    std::string descriptor_sized =
        descriptor.str() + "-" + caffe2::to_string(InputSize());
    std::string descriptor_var_args = descriptor.str() + "-*";
    if (op_to_key.count(descriptor_sized) > 0) {
      return op_to_key[descriptor_sized];
    }
    if (op_to_key.count(descriptor_var_args) > 0) {
      return op_to_key[descriptor_var_args];
    }
    std::stringstream ss;
    ss << "Attempting to run unknown ATen operator configuration: "
       << descriptor_sized;
    CAFFE_THROW(ss.str());
  }

  at::Scalar readScalarAttribute(const std::string& name) {
    if (OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
      return OperatorBase::GetSingleArgument<int64_t>(name, 0);
    } else {
      CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
      return OperatorBase::GetSingleArgument<float>(name, 0);
    }
  }

  template <typename T>
  T readAttribute(const std::string& name) {
    CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
    return OperatorBase::GetSingleArgument<T>(name, 0);
  }

  std::vector<int64_t> readIntList(const std::string& name) {
    CAFFE_ENFORCE(OperatorBase::HasArgument(name));
    return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
  }

  template <int N>
  std::array<bool, N> readBoolMask(const std::string& name) {
    CAFFE_ENFORCE(OperatorBase::HasArgument(name));
    std::vector<int64_t> ints =
        OperatorBase::GetRepeatedArgument<int64_t>(name, {});
    std::array<bool, N> result;
    for (size_t i = 0; i < N; ++i) {
      result[i] = ints.at(i);
    }
    return result;
  }

  at::ScalarType stringToScalarType(const std::string& name) {
#define DEFINE_IF(type, aten) \
  if (#type == name)          \
    return at::k##aten;
    DEFINE_IF(float16, Half)
    DEFINE_IF(float, Float)
    DEFINE_IF(double, Double)
    DEFINE_IF(uint8, Byte)
    DEFINE_IF(int8, Char)
    DEFINE_IF(int16, Short)
    DEFINE_IF(int32, Int)
    DEFINE_IF(int64, Long)
    CAFFE_THROW("unsupported type annotation: ", name);
  }

  at::Type& stringToType(const std::string& name) {
    return at::getType(backend(), stringToScalarType(name));
  }

  at::Type* readTypeAttribute(const std::string& name) {
    CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<std::string>(name));
    return &stringToType(
        OperatorBase::GetSingleArgument<std::string>(name, ""));
  }
};

}
================================================ FILE: caffe2/contrib/aten/aten_test.py ================================================ # Copyright (c) 2016-present, Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############################################################################## from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals from caffe2.python import core, dyndep from hypothesis import given import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op') class TestATen(hu.HypothesisTestCase): @given(inputs=hu.tensors(n=2), **hu.gcs) def test_add(self, inputs, gc, dc): op = core.CreateOperator( "ATen", ["X", "Y"], ["Z"], operator="add") def ref(X, Y): return [X + Y] self.assertReferenceChecks(gc, op, inputs, ref) @given(inputs=hu.tensors(n=1), **hu.gcs) def test_pow(self, inputs, gc, dc): op = core.CreateOperator( "ATen", ["S"], ["Z"], operator="pow", exponent=2.0) def ref(X): return [np.square(X)] self.assertReferenceChecks(gc, op, inputs, ref) @given(x=st.integers(min_value=2, max_value=8), **hu.gcs) def test_sort(self, x, gc, dc): inputs = [np.random.permutation(x)] op = core.CreateOperator( "ATen", ["S"], ["Z", "I"], operator="sort") def ref(X): return [np.sort(X), np.argsort(X)] self.assertReferenceChecks(gc, op, inputs, ref) @given(inputs=hu.tensors(n=1), **hu.gcs) def test_sum(self, inputs, gc, dc): op = core.CreateOperator( "ATen", ["S"], ["Z"], operator="sum") def ref(X): return [np.sum(X)] self.assertReferenceChecks(gc, op, inputs, ref) @given(**hu.gcs) def test_ones(self, gc, dc): op = core.CreateOperator( "ATen", [], ["Z"], operator="ones", type="float", size={2, 4}) def ref(): return [np.ones([2, 4])] self.assertReferenceChecks(gc, op, [], ref) if __name__ == "__main__": import unittest unittest.main() ================================================ FILE: caffe2/contrib/aten/docs/pytorch_to_caffe2.md ================================================ # Using ONNX and ATen to export models from PyTorch to Caffe2 When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up hitting operators that are not yet part of the ONNX specification. These may be operators that haven't been standardized yet, or custom `torch.autograd.Function` types that are specific to a network. To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. [ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten) that can run these tensor functions in a Caffe2 network after importing them through ONNX. This guide explains how to configure Caffe2 and modify your PyTorch program to use this functionality. 
### Enable ATen in Caffe2

The ATen facility in Caffe2 is part of a contrib package and needs to be enabled when you configure Caffe2 using cmake:

```
git clone https://github.com/caffe2/caffe2/
mkdir caffe2/build
cd caffe2/build
cmake -DUSE_ATEN=ON ..
make install
```

### How to Export a PyTorch Autograd Function using ATen

To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run in the forward pass of a network. For each function in the trace, it calls that function's `symbolic` method, which describes how to construct the part of the ONNX graph that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).

When equivalent ONNX operators do not exist, you can instead call any ATen function. As an example, let's assume we have an autograd function which computes `x*x+y`:

```
class MyFunction(Function):
    @staticmethod
    def forward(ctx, x, y):
        return x*x + y
```

We can add a `symbolic` method to it like so:

```
class MyFunction(Function):
    @staticmethod
    def forward(ctx, x, y):
        return x*x + y

    @staticmethod
    def symbolic(graph, x, y):
        x2 = graph.at("mul", x, x)
        r = graph.at("add", x2, y)
        # x, y, x2, and r are 'Node' objects
        # print(r) or print(graph) will print out a textual representation for debugging.
        # this representation will be converted to ONNX protobufs on export.
        return r
```

The function `graph.at` adds a new ATen op to the computation graph. You can call any ATen function using this facility. To do so, first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), [Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).

As an example, we might want to call the `pow` operator:

```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```

We can translate this into the equivalent `graph.at` function:

```
def symbolic(graph, x):
    graph.at("pow", x, exponent_f=2.0)  # compute x**2
```

Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar` like `exponent` becomes a keyword argument that specifies an ONNX attribute. Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).

For methods, the first input is always the `this` Tensor in C++. To call methods of ATen's `Type` objects, you provide an additional string attribute that determines the type. For instance, `ones` creates a new constant tensor of all ones:

```
class Type {
  ...
  virtual Tensor ones(IntList size) const;
  ...
};
```

From PyTorch it can be created by adding the type as an additional attribute:

```
def symbolic(graph, x):
    return graph.at("ones", type_s="float", size_i=[2,4])
```

Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
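Combining the pieces above, a complete function that squares its input through ATen's `pow` might look like the following minimal sketch (the class name `Square` is ours, chosen for illustration; only `graph.at` and the attribute-suffix convention come from the facility described above):

```
class Square(Function):
    @staticmethod
    def forward(ctx, x):
        return x * x

    @staticmethod
    def symbolic(graph, x):
        # the Scalar argument becomes the attribute "exponent",
        # written exponent_f because it is a floating point value
        return graph.at("pow", x, exponent_f=2.0)
```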
## Putting it together

With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:

```
class MyModule(nn.Module):
    def forward(self, x, y):
        # you can combine your ATen ops with standard onnx ones
        x = nn.ReLU()(x)
        return MyFunction.apply(x, y)

torch.onnx.export(MyModule(),
                  (Variable(torch.ones(3, 4)), Variable(torch.ones(3, 4))),
                  "output.onnx",
                  verbose=True)
```

This exports the following graph, which contains calls to the `ATen` operator:

```
graph(%1 : Float(3, 4)
      %2 : Float(3, 4)) {
  %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
  %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
  %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
  return (%5);
}
```

The graph can then be imported using ONNX and run with Caffe2:

```
import onnx
import caffe2.python.onnx.backend
import numpy as np

graph = onnx.load("output.onnx")

a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)

prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]

x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)
```

### Code

For the full source code for this tutorial, see [sample.py](sample.py).

================================================ FILE: caffe2/contrib/aten/docs/sample.py ================================================
import numpy as np

from torch import nn
from torch.autograd import Variable, Function
import torch.onnx

import onnx
import caffe2.python.onnx.backend


class MyFunction(Function):
    @staticmethod
    def forward(ctx, x, y):
        return x*x + y

    @staticmethod
    def symbolic(graph, x, y):
        x2 = graph.at("mul", x, x)
        r = graph.at("add", x2, y)
        # x, y, x2, and r are 'Node' objects
        # print(r) or print(graph) will print out a textual representation for debugging.
        # this representation will be converted to ONNX protobufs on export.
        return r


class MyModule(nn.Module):
    def forward(self, x, y):
        # you can combine your ATen ops with standard onnx ones
        x = nn.ReLU()(x)
        return MyFunction.apply(x, y)


torch.onnx.export(MyModule(),
                  (Variable(torch.ones(3, 4)), Variable(torch.ones(3, 4))),
                  "output.onnx",
                  verbose=True)

# prints the graph for debugging:
# graph(%1 : Float(3, 4)
#       %2 : Float(3, 4)) {
#   %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
#   %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
#   %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
#   return (%5);
# }

graph = onnx.load("output.onnx")

a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)

prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]

x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)

================================================ FILE: caffe2/contrib/aten/gen_op.py ================================================
#!/bin/env python

# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

import sys
import yaml
import argparse
import os
from copy import deepcopy

parser = argparse.ArgumentParser()
parser.add_argument("--template_dir", default=".", help="where template.h is")
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
                    help="where ATen yaml files are")
parser.add_argument("--output_prefix", default="", help="")
parser.add_argument(
    "--install_dir", default=".", help="where to put generated file")
parser.add_argument("--third_party_root", default="",
                    help="caffe2 third_party")
args, _ = parser.parse_known_args()

if args.third_party_root:
    sys.path.append(os.path.join(args.third_party_root, "aten/src/ATen"))
    from code_template import CodeTemplate as CT
else:
    from src.ATen.code_template import CodeTemplate as CT

OP_TEMPLATE = CT.from_file(
    os.path.join(args.template_dir, 'aten_op_template.h'))

try:
    # use faster C loader if available
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader


def write(filename, s):
    with open(filename, "w") as f:
        f.write(s)


def read(filename):
    with open(filename, "r") as f:
        return f.read()


def value_has_tensors(v):
    # Sparse shouldn't appear in public API, seems to be a temporary bug
    return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']


def value_is_tensor_type(v):
    return value_has_tensors(v) and v['dynamic_type'] != 'TensorList'


# for each aten type, how do we handle a return value of that type?
RETURN_MAP = {
    'Tensor': 'assignTo(Output(${offset}),${output});',
    'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});',
    'bool': 'assignToValue(Output(${offset}),${output});',
    'int64_t': 'assignToValue(Output(${offset}),${output});',
    'std::vector<Tensor>': 'assignListStartingAt(${offset}, ${output});',
}

# for each non-Tensor aten argument, how do we read it from caffe2's
# attribute list. Most of these call runtime functions defined in the
# template class.
ARGUMENT_MAP = {
    'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
    'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
    'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
    'double': 'double ${arg} = readAttribute<float>("${arg}");',
    'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
    'IntList': 'auto ${arg} = readIntList("${arg}");',
    'std::array<bool,2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
    'std::array<bool,3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
}


def expand(o):
    num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
    results = [o]
    for i in range(0, num_defaults):
        # last num_default values should be default
        assert('default' in o['arguments'][-(i + 1)])
        v = deepcopy(o)
        v['arguments'] = v['arguments'][:-(i + 1)]
        results.append(v)
    return results


# filter the list of declarations removing things we cannot support
def supports(o):
    # skip all in-place operators for now since aten cannot Resize
    # caffe2 memory inside an operator
    if o['inplace']:
        return False

    # _out variants also work in-place on arguments taken as destinations
    # we also cannot handle these because aten cannot resize caffe2 Tensors
    if "_out" in o['name']:
        return False

    # skip return types we cannot handle
    for ret in o['returns']:
        if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
            print("Skipping {} Because of Ret: {} ({})".format(
                  o['name'], ret['type'], ret['dynamic_type']))
            return False

    # skip arguments we cannot handle
    for arg in o['arguments']:
        if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
            print("Skipping {} Because of Arg: {} ({}) ".format(
                  o['name'], arg['type'], arg['dynamic_type']))
            return False
    return True


# template for each potential operator.
# each operator has an integer 'key' associated with it, and
# a lambda that defines the operator
# non-tensor attributes are created in ${initialization}
# and then saved as arguments to the lambda
# Inputs/Outputs are read inside the lambda
OPTION_TEMPLATE = CT("""\
case ${key}: { // ${name}
    ${initialization}
    run_op = [=] {
        ${statements}
        auto the_result = ${invocation};
        ${assignments}
        return true;
    };
} break;
""")


def get_output(o, i):
    if len(o['returns']) == 1:
        return 'the_result'
    else:
        return 'std::get<{}>(the_result)'.format(i)


def attribute_names(o):
    return sorted([a['name'] for a in o['arguments']
                   if not value_has_tensors(a)])


def required_attribute_names(o):
    return sorted([a['name'] for a in o['arguments']
                   if not value_has_tensors(a) and 'default' not in a])


def self_as_first_argument(arguments):
    return ([a for a in arguments if a['name'] == 'self'] +
            [a for a in arguments if a['name'] != 'self'])


def get_num_inputs(o):
    args = 0
    for a in o['arguments']:
        if a['type'] == 'TensorList':
            return '*'
        elif value_has_tensors(a):
            args += 1
    return str(args)


if __name__ == '__main__':
    decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')),
                      Loader=Loader)
    filtered = [expanded for o in decls
                for expanded in expand(o) if supports(expanded)]
    top_env = {
        'mappings': [],
        'implementations': [],
    }
    seen = set()
    key = 0
    for o in filtered:
        # [DESCRIPTORS]
        # each option is associated with a descriptor string that is used
        # to figure out which version of an op is being used:
        # The format is:
        #     opname-attribute_1-attribute_2-num_inputs
        # Example:
        #     lerp-weight-2
        #     the operator lerp takes 2 arguments and has the attribute weight
        attr_names = attribute_names(o)
        num_inputs = get_num_inputs(o)
        descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
        if descriptor in seen:
            continue
        seen.add(descriptor)
        # map from descriptor string to the integer key in the switch
        # statements that initialize the operators
        top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
        env = {
            'name': o['name'],
            'statements': [],
            'arguments': [],
            'assignments': [],
            'initialization': [],
            'key': str(key),
        }
        defined_inferred_type = False
        if 'Tensor' in o['method_of']:
            # make sure 'self' is the first argument. currently Declarations.yaml
            # does not always do this. Instead it keeps the argument list in the
            # same order as the Type method.
            o['arguments'] = self_as_first_argument(o['arguments'])
        elif 'namespace' not in o['method_of']:
            # methods on type like 'ones' or 'zeros' always take a
            # string attribute that is translated into the at::Type object
            # e.g. "Float" is at::kFloat
            assert('Type' in o['method_of'])
            defined_inferred_type = True
            env['initialization'].append(
                'auto inferred_type = readTypeAttribute("type");')

        i = 0
        for arg in o['arguments']:
            env['arguments'].append(arg['name'])
            if arg['type'] == 'TensorList':
                env['statements'].append(
                    'auto {} = loadInputsAtOffset({});'.format(arg['name'], i))
            elif value_is_tensor_type(arg):
                assert(i != '*')  # tensor list is not last argument
                # load tensor inputs from Caffe2
                env['statements'].append(
                    "auto {} = loadInput({});".format(arg['name'], i))
                i += 1
                if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type:
                    # first tensor input is used to define the output type.
                    defined_inferred_type = True
                    env['statements'].append(
                        'auto inferred_type = &({}.type());'.format(
                            arg['name']))
            else:
                init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
                env['initialization'].append(init)

        for i, r in enumerate(o['returns']):
            t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor']
            assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
            env['assignments'].append(assignment)

        if 'Tensor' in o['method_of']:
            env['invocation'] = "self.{}({})".format(
                o['name'], ', '.join(env['arguments'][1:]))
        elif 'namespace' in o['method_of']:
            env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
        else:
            assert('Type' in o['method_of'])
            env['invocation'] = CT(
                'inferred_type->${name}(${arguments})').substitute(env)

        top_env['implementations'].append(OPTION_TEMPLATE.substitute(env))
        key += 1

    write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"),
          OP_TEMPLATE.substitute(top_env))

================================================ FILE: caffe2/contrib/cuda-convnet2/LICENSE ================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: caffe2/contrib/cuda-convnet2/README.md ================================================ # cuda-convnet2 Automatically exported from code.google.com/p/cuda-convnet2 You can read the documentation in two ways: 1. On this site: go to branches > wiki. 2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/ ================================================ FILE: caffe2/contrib/cuda-convnet2/build.sh ================================================ #!/bin/sh # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. ############################################################################### # Fill in the below environment variables. # # If you're not sure what these paths should be, # you can use the find command to try to locate them. # For example, NUMPY_INCLUDE_PATH contains the file # arrayobject.h. So you can search for it like this: # # find /usr -name arrayobject.h # # (it'll almost certainly be under /usr) # CUDA toolkit installation directory. export CUDA_INSTALL_PATH=/usr/local/cuda # Python include directory. This should contain the file Python.h, among others. export PYTHON_INCLUDE_PATH=/usr/include/python2.7 # Numpy include directory. This should contain the file arrayobject.h, among others. export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ # ATLAS library directory. This should contain the file libcblas.so, among others. export ATLAS_LIB_PATH=/usr/lib/atlas-base # You don't have to change these: export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples export PATH=$PATH:$CUDA_INSTALL_PATH/bin cd util && make numpy=1 -j $* && cd .. cd nvmatrix && make -j $* && cd .. cd cudaconv3 && make -j $* && cd .. cd cudaconvnet && make -j $* && cd .. cd make-data/pyext && make -j $* && cd ../.. ================================================ FILE: caffe2/contrib/cuda-convnet2/convdata.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
================================================
FILE: caffe2/contrib/cuda-convnet2/convdata.py
================================================
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from python_util.data import *
import numpy.random as nr
import numpy as n
import random as r
from time import time
from threading import Thread
from math import sqrt
import sys
#from matplotlib import pylab as pl
from PIL import Image
from StringIO import StringIO
from time import time
import itertools as it

class JPEGBatchLoaderThread(Thread):
    def __init__(self, dp, batch_num, label_offset, list_out):
        Thread.__init__(self)
        self.list_out = list_out
        self.label_offset = label_offset
        self.dp = dp
        self.batch_num = batch_num

    @staticmethod
    def load_jpeg_batch(rawdics, dp, label_offset):
        if type(rawdics) != list:
            rawdics = [rawdics]
        nc_total = sum(len(r['data']) for r in rawdics)

        jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics))
        labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics))

        img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32)
        lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32)
        dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview)
        lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1))
        for c in xrange(nc_total):
            lab_mat[c, [z + label_offset for z in labels[c]]] = 1
        lab_mat = n.tile(lab_mat, (dp.data_mult, 1))

        return {'data': img_mat[:nc_total * dp.data_mult,:],
                'labvec': lab_vec[:nc_total * dp.data_mult,:],
                'labmat': lab_mat[:nc_total * dp.data_mult,:]}

    def run(self):
        rawdics = self.dp.get_batch(self.batch_num)
        p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics, self.dp, self.label_offset)
        self.list_out.append(p)

class ColorNoiseMakerThread(Thread):
    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs, self.pca_vecs = pca_stdevs, pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out

    def run(self):
        noise = n.dot(nr.randn(self.num_noise, 3).astype(n.single) * self.pca_stdevs.T, self.pca_vecs.T)
        self.list_out.append(noise)

class ImageDataProvider(LabeledDataProvider):
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.data_mean = self.batch_meta['data_mean'].astype(n.single)
        self.color_eig = self.batch_meta['color_pca'][1].astype(n.single)
        self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)]
        self.color_noise_coeff = dp_params['color_noise']
        self.num_colors = 3
        self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
        self.mini = dp_params['minibatch_size']
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size
        self.inner_pixels = self.inner_size ** 2
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2
        self.data_mult = self.num_views if self.multiview else 1
        self.batch_size = self.batch_meta['batch_size']
        self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
        self.scalar_mean = dp_params['scalar_mean']
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        self.data = [None, None] # These are pointers to previously-returned data matrices
        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']

        self.num_noise = self.batch_size
        self.batches_generated, self.loaders_started = 0, 0
        self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))

        if self.scalar_mean >= 0:
            self.data_mean_crop = self.scalar_mean

    def showimg(self, img):
        from matplotlib import pylab as pl
        pixels = img.shape[0] / 3
        size = int(sqrt(pixels))
        img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
        pl.imshow(img, interpolation='nearest')
        pl.show()

    def get_data_dims(self, idx=0):
        if idx == 0:
            return self.inner_size**2 * 3
        if idx == 2:
            return self.get_num_classes()
        return 1

    def start_loader(self, batch_idx):
        self.load_data = []
        self.loader_thread = JPEGBatchLoaderThread(self,
                                                   self.batch_range[batch_idx],
                                                   self.label_offset,
                                                   self.load_data)
        self.loader_thread.start()

    def start_color_noise_maker(self):
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list

    def set_labels(self, datadic):
        pass

    def get_data_from_loader(self):
        if self.loader_thread is None:
            self.start_loader(self.batch_idx)
            self.loader_thread.join()
            self.data[self.d_idx] = self.load_data[0]
            self.start_loader(self.get_next_batch_idx())
        else:
            # Set the argument to join to 0 to re-enable batch reuse
            self.loader_thread.join()
            if not self.loader_thread.is_alive():
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            #else:
            #    print "Re-using batch"
        self.advance_batch()

    def add_color_noise(self):
        # At this point the data already has 0 mean.
        # So I'm going to add noise to it, but I'm also going to scale down
        # the original data. This is so that the overall scale of the training
        # data doesn't become too different from the test data.
        s = self.data[self.d_idx]['data'].shape
        cropped_size = self.get_data_dims(0) / 3
        ncases = s[0]

        if self.color_noise_thread is None:
            self.color_noise_list = self.start_color_noise_maker()
            self.color_noise_thread.join()
            self.color_noise = self.color_noise_list[0]
            self.color_noise_list = self.start_color_noise_maker()
        else:
            self.color_noise_thread.join(0)
            if not self.color_noise_thread.is_alive():
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()

        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size))
        self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
        self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff
        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size))
        self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division.
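    # Annotation (not in the original source): get_next_batch below alternates
    # self.d_idx between 0 and 1, so self.data acts as a two-slot ring buffer.
    # While the model consumes buffer k, JPEGBatchLoaderThread decodes the next
    # batch into buffer 1-k, and ColorNoiseMakerThread similarly prepares the
    # next PCA noise sample in the background; the join() calls above are the
    # points where the main thread waits for those workers to finish.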
    def get_next_batch(self):
        self.d_idx = self.batches_generated % 2
        epoch, batchnum = self.curr_epoch, self.curr_batchnum

        self.get_data_from_loader()

        # Subtract mean
        self.data[self.d_idx]['data'] -= self.data_mean_crop

        if self.color_noise_coeff > 0 and not self.test:
            self.add_color_noise()
        self.batches_generated += 1

        return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T]

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1))
        return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)

class CIFARDataProvider(LabeledDataProvider):
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.img_size = 32
        self.num_colors = 3
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size']
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 9
        self.scalar_mean = dp_params['scalar_mean']
        self.data_mult = self.num_views if self.multiview else 1
        self.data_dic = []
        for i in batch_range:
            self.data_dic += [unpickle(self.get_data_file_name(i))]
            self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single)
            self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C')
            self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C')

        self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)]

        self.batches_generated = 0
        self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1))

    def get_next_batch(self):
        epoch, batchnum = self.curr_epoch, self.curr_batchnum
        self.advance_batch()
        bidx = batchnum - self.batch_range[0]

        cropped = self.cropped_data[self.batches_generated % 2]

        self.__trim_borders(self.data_dic[bidx]['data'], cropped)
        cropped -= self.data_mean
        self.batches_generated += 1
        return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']]

    def get_data_dims(self, idx=0):
        return self.inner_size**2 * self.num_colors if idx == 0 else 1

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data):
        return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)

    def __trim_borders(self, x, target):
        y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1])

        if self.test: # don't need to loop over cases
            if self.multiview:
                start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2),
                                   (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2),
                                   (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                for i in xrange(self.num_views):
                    target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1]))
            else:
                pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
                target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
        else:
            for c in xrange(x.shape[1]): # loop over cases
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                pic = y[:,startY:endY,startX:endX, c]
                if nr.randint(2) == 0: # also flip the image with 50% probability
                    pic = pic[:,:,::-1]
                target[:,c] = pic.reshape((self.get_data_dims(),))

class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider):
    def __init__(self, data_dim):
        LabeledDummyDataProvider.__init__(self, data_dim)
        self.img_size = int(sqrt(data_dim/3))

    def get_next_batch(self):
        epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
        dic = {'data': dic[0], 'labels': dic[1]}
        print dic['data'].shape, dic['labels'].shape
        return epoch, batchnum, [dic['data'], dic['labels']]

    # Returns the dimensionality of the two data matrices returned by get_next_batch
    def get_data_dims(self, idx=0):
        return self.batch_meta['num_vis'] if idx == 0 else 1
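The PCA color-noise scheme split across ColorNoiseMakerThread.run and ImageDataProvider.add_color_noise above can be condensed into a few lines of numpy. A minimal sketch with made-up PCA statistics (stdevs and vecs here are hypothetical stand-ins for batch_meta['color_pca']); it mirrors the reshapes in add_color_noise, which give every (case, channel) pair a single scalar offset:

import numpy as np

rng = np.random.RandomState(0)
ncases, npix, coeff = 4, 8, 0.1                         # coeff plays the role of --color-noise
data = rng.randn(ncases, 3 * npix).astype(np.float32)   # zero-mean batch, channels contiguous
stdevs = np.array([[0.2], [0.05], [0.02]], np.float32)  # hypothetical PCA stdevs, shape (3, 1)
vecs = np.eye(3, dtype=np.float32)                      # hypothetical PCA eigenvectors, (3, 3)

# One 3-vector of noise per case, scaled along each principal color axis.
noise = np.dot(rng.randn(ncases, 3).astype(np.float32) * stdevs.T, vecs.T)

pixels = data.reshape(ncases * 3, npix)          # one row per (case, channel)
pixels += noise.reshape(ncases * 3, 1) * coeff   # same offset for every pixel of a channel
data = pixels.reshape(ncases, 3 * npix) * (1.0 / (1.0 + coeff))  # rescale toward test statistics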
================================================
FILE: caffe2/contrib/cuda-convnet2/convnet.py
================================================
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as n
import numpy.random as nr
import random as r
from python_util.util import *
from python_util.data import *
from python_util.options import *
from python_util.gpumodel import *
import sys
import math as m
import layer as lay
from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider
from os import linesep as NL
import copy as cp
import os

class Driver(object):
    def __init__(self, convnet):
        self.convnet = convnet

    def on_start_batch(self, batch_data, train):
        pass

    def on_finish_batch(self):
        pass

class GradCheckDriver(Driver):
    def on_start_batch(self, batch_data, train):
        data = batch_data[2]
        self.convnet.libmodel.checkGradients(data)

class TrainingDriver(Driver):
    def on_start_batch(self, batch_data, train):
        data = batch_data[2]
        self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train)

class MultiviewTestDriver(TrainingDriver):
    def on_start_batch(self, batch_data, train):
        self.write_output = False
        if train:
            TrainingDriver.on_start_batch(self, batch_data, train)
        else:
            data = batch_data[2]
            num_views = self.convnet.test_data_provider.num_views
            if self.convnet.test_out != "" and self.convnet.logreg_name != "":
                self.write_output = True
                self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1])
                self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single)
                self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name)
            else:
                self.convnet.libmodel.startMultiviewTest(data, num_views)

    def on_finish_batch(self):
        if self.write_output:
            if not os.path.exists(self.convnet.test_out):
                os.makedirs(self.convnet.test_out)
            pickle(self.test_file_name, {'data': self.probs,
                                         'note': 'generated from %s' % self.convnet.save_file})

class FeatureWriterDriver(Driver):
    def __init__(self, convnet):
        Driver.__init__(self, convnet)
        self.last_batch = convnet.test_batch_range[-1]

    def on_start_batch(self, batch_data, train):
        if train:
            raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. It writes test data features.")

        self.batchnum, self.data = batch_data[1], batch_data[2]

        if not os.path.exists(self.convnet.feature_path):
            os.makedirs(self.convnet.feature_path)

        self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs']
        self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single)
        self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features])

    def on_finish_batch(self):
        path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum)
        pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]})
        print "Wrote feature file %s" % path_out
        if self.batchnum == self.last_batch:
            pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model': self.convnet.load_file,
                                                                             'num_vis': self.num_ftrs,
                                                                             'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']})

class ConvNet(IGPUModel):
    def __init__(self, op, load_dic, dp_params={}):
        filename_options = []
        for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'):
            dp_params[v] = op.get_value(v)

        IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)

    def import_model(self):
        lib_name = "cudaconvnet._ConvNet"
        print "========================="
        print "Importing %s C++ module" % lib_name
        self.libmodel = __import__(lib_name, fromlist=['_ConvNet'])

    def init_model_lib(self):
        self.libmodel.initModel(self.layers,
                                self.device_ids,
                                self.minibatch_size,
                                self.conserve_mem)

    def init_model_state(self):
        ms = self.model_state
        layers = ms['layers'] if self.loaded_from_checkpoint else {}
        ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def),
                                                    os.path.join(self.layer_path, self.layer_params), self, layers=layers)
        self.do_decouple_conv()
        self.do_unshare_weights()

        self.op.set_value('conv_to_local', [], parse=False)
        self.op.set_value('unshare_weights', [], parse=False)

        self.set_driver()

    def do_decouple_conv(self):
        # Convert convolutional layers to local
        if len(self.op.get_value('conv_to_local')) > 0:
            for lname in self.op.get_value('conv_to_local'):
                if self.model_state['layers'][lname]['type'] == 'conv':
                    lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname)

    def do_unshare_weights(self):
        # Decouple weight matrices
        if len(self.op.get_value('unshare_weights')) > 0:
            for name_str in self.op.get_value('unshare_weights'):
                if name_str:
                    name = lay.WeightLayerParser.get_layer_name(name_str)
                    if name is not None:
                        name, idx = name[0], name[1]
                        if name not in self.model_state['layers']:
                            raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
                        layer = self.model_state['layers'][name]
                        lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx)
                    else:
                        raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)

    def set_driver(self):
        if self.op.get_value('check_grads'):
            self.driver = GradCheckDriver(self)
        elif self.op.get_value('multiview_test'):
            self.driver = MultiviewTestDriver(self)
        elif self.op.get_value('write_features'):
            self.driver = FeatureWriterDriver(self)
        else:
            self.driver = TrainingDriver(self)

    def fill_excused_options(self):
        if self.op.get_value('check_grads'):
            self.op.set_value('save_path', '')
            self.op.set_value('train_batch_range', '0')
            self.op.set_value('test_batch_range', '0')
            self.op.set_value('data_path', '')

    # Make sure the data provider returned data in proper format
    def parse_batch_data(self, batch_data, train=True):
        if max(d.dtype != n.single for d in batch_data[2]):
            raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
        return batch_data

    def start_batch(self, batch_data, train=True):
        self.driver.on_start_batch(batch_data, train)

    def finish_batch(self):
        ret = IGPUModel.finish_batch(self)
        self.driver.on_finish_batch()
        return ret

    def print_iteration(self):
        print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()),

    def print_train_time(self, compute_time_py):
        print "(%.3f sec)" % (compute_time_py)

    def print_costs(self, cost_outputs):
        costs, num_cases = cost_outputs[0], cost_outputs[1]
        children = set()
        for errname in costs:
            if sum(errname in self.layers[z]['children'] for z in costs) == 0:
#                print self.layers[errname]['children']
                for child in set(self.layers[errname]['children']) & set(costs.keys()):
                    costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])]
                    children.add(child)

                filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases)
                print "%s: " % errname,
                if 'outputFilterFormatter' not in self.layers[errname]:
                    print ", ".join("%.6f" % v for v in filtered_costs),
                else:
                    print eval(self.layers[errname]['outputFilterFormatter'])(self, filtered_costs),
                if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]):
                    print "<- error nan or inf!"
                    sys.exit(1)
        for c in children:
            del costs[c]

    def print_train_results(self):
        self.print_costs(self.train_outputs[-1])

    def print_test_status(self):
        pass

    def print_test_results(self):
        print NL + "======================Test output======================"
        self.print_costs(self.test_outputs[-1])
        if not self.test_only:
            print NL + "----------------------Averages-------------------------"
            self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):]))
        print NL + "-------------------------------------------------------",
        for name, val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now.
            l = self.layers[name]
            if 'weights' in l:
                wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i, (w, wi) in enumerate(zip(l['weights'], l['weightsInc']))]
                print ""
                print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales),
                print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
        print ""

    def conditional_save(self):
        self.save_state()

    def aggregate_test_outputs(self, test_outputs):
        test_outputs = cp.deepcopy(test_outputs)
        num_cases = sum(t[1] for t in test_outputs)
        for i in xrange(1, len(test_outputs)):
            for k, v in test_outputs[i][0].items():
                for j in xrange(len(v)):
                    test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
        return (test_outputs[0][0], num_cases)

    @classmethod
    def get_options_parser(cls):
        op = IGPUModel.get_options_parser()
        op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
        op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False)
        op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
        op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="")

        op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range'])
        op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
        op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True)
        op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
        op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
        op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
        op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
        op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
        op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
        op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1)
        op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path'])
        op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="")

        op.delete_option('max_test_err')
        op.options["testing_freq"].default = 57
        op.options["num_epochs"].default = 50000
        op.options['dp_type'].default = None

        DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider)
        DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider)
        DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider)

        return op

if __name__ == "__main__":
#    nr.seed(6)

    op = ConvNet.get_options_parser()

    op, load_dic = IGPUModel.parse_options(op)
    model = ConvNet(op, load_dic)
    model.start()
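aggregate_test_outputs above just sums the per-batch cost vectors element-wise and returns the total case count, leaving the per-case division to each cost layer's outputFilter. A minimal standalone sketch of that bookkeeping (the cost dicts here are invented stand-ins for real test_outputs entries):

batches = [({'logprob': [250.0, 12.0]}, 128),   # (costs, num_cases) per test batch
           ({'logprob': [260.0, 15.0]}, 128),
           ({'logprob': [130.0,  6.0]}, 64)]

total = {k: list(v) for k, v in batches[0][0].items()}
num_cases = sum(nc for _, nc in batches)
for costs, _ in batches[1:]:
    for k, v in costs.items():
        for j in range(len(v)):
            total[k][j] += v[j]

print(num_cases)                                   # 320
print([c / num_cases for c in total['logprob']])   # per-case averages: 2.0 and ~0.103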
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconv3/Makefile
================================================
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users. This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################

# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64

# Common binaries
NVCC = $(CUDA_BIN_PATH)/nvcc
GCC  = g++
AR   = ar

# CUDA code generation flags
GENCODE_SM35  := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM35)

LDFLAGS   := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS   := -m64
NVCCFLAGS := -m64

# Debug build flags
ifeq ($(dbg),1)
    CCFLAGS   += -g
    NVCCFLAGS += -g -G
    DBG := debug
else
    DBG := release
    NVCCFLAGS += -O3
    CCFLAGS += -O3
endif

# Add profiler output
ifeq ($(prof),1)
    NVCCFLAGS += --ptxas-options=-v
endif

TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)

########## USER STUFF ###########
LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include

CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS  := $(shell find . -name "*.h")

NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/libcudaconv.so

################################################################################
# Set up target and object files
################################################################################
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))

# Target rules
all: makedirs $(TARGET)

$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
	$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
	$(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<

$(TARGET): $(OBJS)
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
	ln -sf $(TARGET) .

makedirs:
	mkdir -p $(TARGETDIR)
	mkdir -p $(OBJDIR)/src

clean:
	rm -rf ./obj
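GENCODE_FLAGS above is assembled from per-architecture variables, so the sm_35-only default can be extended without touching the build rules: each extra -gencode pair makes nvcc emit machine code for one more GPU generation. A hedged illustration of that extension point (the sm_61 variable is an example for Pascal cards, not something this Makefile defines):

GENCODE_SM61  := -gencode arch=compute_61,code=sm_61
GENCODE_FLAGS := $(GENCODE_SM35) $(GENCODE_SM61)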
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef CONV_UTIL_CUH
#define CONV_UTIL_CUH

#include "../../nvmatrix/include/nvmatrix.cuh"
#include "caffe2/core/context_gpu.h"

#ifndef MIN
#define MIN(a, b) ((a) > (b) ? (b) : (a))
#endif
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
                      int subsX, int startX, int strideX, int outputsX);
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
                      int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum);

void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
                      int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum,
                      float scaleTargets, float scaleOutput);
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
                      int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput);

void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX,
                      float addScale, float powScale, float minDiv);
void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target,
                          int numFilters, int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters,
                      int sizeX, float addScale, float powScale, float minDiv);
void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target,
                          int numFilters, int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);

void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
                      float scaleTargets, float scaleOutputs);
void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
                    int strideX, float scaleTargets, float scaleOutput);
void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
                        int startX, int strideX, float scaleTargets, float scaleOutput);

void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale);
void convRGBToYUV(NVMatrix& images, NVMatrix& target);
void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center);
void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX);
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm);
void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& target,
                              int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked);
void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target,
                                  int numFilters, int sizeF, float addScale, float powScale, float minDiv,
                                  bool blocked, float scaleTargets, float scaleOutput);
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF,
                              float addScale, float powScale, bool blocked);
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF,
                              float addScale, float powScale, float minDiv, bool blocked);
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize);

void convCrossMapMaxPoolUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
                             const int imgSize, const int startF, const int poolSize,
                             const int stride, const float scaleTargets, const float scaleOutputs);

cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor);

template <bool sum>
class AvgPooler {
public:
    __device__ inline float operator()(const float a, const float b) const {
        return a + b;
    }
    __device__ inline float getBaseValue() const {
        return 0;
    }
    __device__ inline float output(const float a, const int regionSize) const {
        return sum ? a : (a / regionSize);
    }
};

class MaxPooler {
public:
    __device__ inline float operator()(const float a, const float b) const {
        return fmaxf(a, b);
    }
    __device__ inline float getBaseValue() const {
        return -2e38;
    }
    __device__ inline float output(const float a, const int regionSize) const {
        return a;
    }
};

class MaxAbsPooler {
public:
    __device__ inline float operator()(const float a, const float b) const {
        return fabsf(a) > fabsf(b) ? a : b;
    }
    __device__ inline float getBaseValue() const {
        return 0.0f;
    }
    __device__ inline float output(const float a, const int regionSize) const {
        return a;
    }
};

/*
 * Block size B_YxB_X
 * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
 * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
 *
 * So each block does one output for some number of images/filters.
 *
 * threadIdx.x determines img idx
 * threadIdx.y determines filter idx
 *
 * imgs:        (numFilters, imgPixels, numImages)
 * target:      (numFilters, numOutputs, numImages)
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 */
template<class Agg, int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool(float* imgs, float* target, const int imgSize, const int numFilters,
                           const int numImages, const int subsX, const int startX, const int strideX,
                           const int outputsX, Agg agg) {
    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
    const int outputIdxX = blockIdx.x / numImgBlocks;
    const int outputIdxY = blockIdx.y / numFilterBlocks;
    const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
    const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
    const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
    if (myFilterIdx >= numFilters) {
        return;
    }

    const int outputIdx = outputIdxY * outputsX + outputIdxX;
    const int numOutputs = outputsX * outputsX;
    const int imgPixels = imgSize * imgSize;

    const int startImgPxX = startX + outputIdxX * strideX;
    const int startImgPxY = startX + outputIdxY * strideX;
    const int imgIdx = blockImgIdx + threadIdx.x;

    imgs += myFilterIdx * imgPixels * numImages + imgIdx;
    target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;

    float prod[filtersPerThread][imgsPerThread];
    #pragma unroll
    for (int f = 0; f < filtersPerThread; f++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[f][i] = agg.getBaseValue();
        }
    }

    const int loopStartY = MAX(0, startImgPxY);
    const int loopStartX = MAX(0, startImgPxX);
    const int loopEndY = MIN(imgSize, startImgPxY + subsX);
    const int loopEndX = MIN(imgSize, startImgPxX + subsX);
    const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
    for (int y = loopStartY; y < loopEndY; y++) {
        for (int x = loopStartX; x < loopEndX; x++) {
            const int imgPx = y * imgSize + x;
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                    #pragma unroll
                    for (int f = 0; f < filtersPerThread; f++) {
                        prod[f][i] = agg(prod[f][i], imgs[(f * imgPixels + imgPx) * numImages + i * B_X]);
                    }
                }
            }
        }
    }

    #pragma unroll
    for (int i = 0; i < imgsPerThread; i++) {
        if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
            #pragma unroll
            for (int f = 0; f < filtersPerThread; f++) {
                target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
            }
        }
    }
}

/*
 * Block size B_YxB_X
 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
 * blockIdx.y determines pixel.y, output idx in batches of B_Y
 *
 * So each block does one pixel for some number of images/filters.
 *
 * threadIdx.x determines img idx
 * threadIdx.y determines output idx
 *
 * imgs:        (numFilters, imgPixels, numImages)
 * target:      (numOutputs, imgPixels, numImages) (out)
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 */
template<class Agg, int B_X, int B_Y, int imgsPerThread, bool checkCaseBounds>
__global__ void kPoolCrossMap(float* imgs, float* target, const int imgSize, const int numFilters,
                              const int numImages, const int startF, const int poolSize,
                              const int numOutputs, const int stride, Agg agg) {
    const int imgPixels = imgSize * imgSize;
    const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
//    const int numOutputs = DIVUP(numFilters, stride);
    const int numOutputBlocks = DIVUP(numOutputs,B_Y);
    const int pxIdxX = blockIdx.x / numImgBlocks;
    const int pxIdxY = blockIdx.y / numOutputBlocks;
    const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
    const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y;
//    const int filterIdx = outputIdx * stride;

    const int pxIdx = pxIdxY * imgSize + pxIdxX;
    const int imgIdx = blockImgIdx + threadIdx.x;
    if (outputIdx < numOutputs) {
        imgs += (pxIdx) * numImages + imgIdx;
        target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx;

        float prod[imgsPerThread];
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                prod[i] = agg.getBaseValue();
            }
        }

        const int myStartF = startF + outputIdx * stride;
        const int loopStartF = max(0, myStartF);
        const int loopEndF = min(numFilters, myStartF + poolSize);

        for (int f = loopStartF; f < loopEndF; ++f) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                    prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]);
                }
            }
        }

        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                target[i * B_X] = agg.output(prod[i], poolSize);
            }
        }
    }
}

/*
 * imgs:        (numFilters, imgPixels, numImages)
 * target:      (numOutputs, imgPixels, numImages)
 */
template<class Pooler>
void convPoolCrossMap(NVMatrix& images, NVMatrix& target, const int startF, const int poolSize,
                      const int numOutputs, const int stride, const int imgSize, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = imgSize * imgSize;
    int numFilters = images.getNumRows() / imgPixels;
    assert(images.getNumRows() == numFilters * imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    assert(stride <= poolSize);
    assert(startF <= 0);
    assert(startF + (numOutputs-1) * stride + poolSize >= numFilters); // All filters must be covered
    cudaStream_t stream = NVMatrix::getDefaultStream();
    target.resize(imgPixels*numOutputs, numImages);
    int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;

    dim3 threads(32, 4);
    dim3 blocks(imgSize * DIVUP(numImages, threads.x * imgsPerThread), imgSize * DIVUP(numOutputs, threads.y));
    bool checkCaseBounds = numImages % (threads.x*imgsPerThread) != 0;
    if (!checkCaseBounds) {
        if (imgsPerThread == 4) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 32, 4, 4, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 32, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        } else if (imgsPerThread == 2) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 32, 4, 2, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 32, 4, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        } else if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 32, 4, 1, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        }
    } else {
        if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 32, 4, 1, true>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        } else {
            assert(false);
        }
    }
    getLastCudaError("convPoolCrossMap: kernel execution failed");
}
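// Annotation (not in the original header): every host-side wrapper in this file
// picks a kernel instantiation the way convPoolCrossMap does above. A worked
// example: numImages = 100 gives 100 % 128 != 0 and 100 % 64 != 0, so
// imgsPerThread = 1; then 100 % (32 * 1) != 0, so checkCaseBounds = true and the
// <Pooler, 32, 4, 1, true> variant is launched. With numImages = 128 instead,
// imgsPerThread = 4 and checkCaseBounds = false: each thread handles four images
// and the per-case bounds tests compile away.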
/*
 * Block size 16xB_X
 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
 *
 * So each block does a 4x4 region for some number of images/filters.
 *
 * threadIdx.x determines img idx
 * threadIdx.y determines pixel idx
 *
 * imgs:        (numFilters, imgPixels, numImages)
 * target:      (numFilters, numOutputs, numImages)
 *
 * B_X one of 8, 16, 32
 * imgsPerThread one of 1, 2, 4, 8, 16
 *
 * B_XximgsPerThread MUST be divisible by 32.
 * Number of filters MUST be divisible by filtersPerThread.
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 *
 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
 * reading than writing here, and the reading is all coalesced, so it should be OK.
 *
 * To be used when the stride is 1 and the pooling region is fairly large.
 */
template<class Agg, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const int numFilters,
                            const int numImages, const int subsX, const int startX,
                            const int outputsX, Agg agg) {
    __shared__ float shImgs[filtersPerThread][B_X*imgsPerThread];
    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int numFilterBlocks = numFilters/(filtersPerThread);
    const int blockOutputX = 4*(blockIdx.x / numImgBlocks);
    const int blockOutputY = 4*(blockIdx.y / numFilterBlocks);
    const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
    const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;

//    const int blockOutputIdx = blockOutputY * outputsX + blockOutputX;
    const int numOutputs = outputsX * outputsX;
    const int imgPixels = imgSize * imgSize;

    const int tidx = threadIdx.y * B_X + threadIdx.x;
    const int loadY = tidx / 32, loadX = tidx % 32;

    const int myX = threadIdx.y % 4;
    const int myY = threadIdx.y / 4;

    const int myOutputIdxY = blockOutputY + myY;
    const int myOutputIdxX = blockOutputX + myX;
    const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX;

    const int startImgPxX = startX + blockOutputX;
    const int startImgPxY = startX + blockOutputY;
    const int endImgPxX = startImgPxX + subsX;
    const int endImgPxY = startImgPxY + subsX;

    const int myStartImgPxY = startImgPxY + myY;
    const int myStartImgPxX = startImgPxX + myX;
    const int myEndImgPxY = endImgPxY + myY;
    const int myEndImgPxX = endImgPxX + myX;

    const int loopStartY = MAX(startImgPxY, 0);
    const int loopStartX = MAX(startImgPxX, 0);
    const int loopEndY = MIN(imgSize, endImgPxY + 3);
    const int loopEndX = MIN(imgSize, endImgPxX + 3);

    const int imgIdx = blockImgIdx + threadIdx.x;

    imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
    target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx;

    float prod[filtersPerThread][imgsPerThread];
    #pragma unroll
    for (int f = 0; f < filtersPerThread; f++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[f][i] = agg.getBaseValue();
        }
    }
    int regionSize = 0;
    for (int y = loopStartY; y < loopEndY; y++) {
        const bool isInY = y >= myStartImgPxY && y < myEndImgPxY;
        for (int x = loopStartX; x < loopEndX; x++) {
            // Load a pixel
            const int px = y * imgSize + x;
            #pragma unroll
            for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
                if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
                    #pragma unroll
                    for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
                        if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
                            shImgs[ly + loadY][lx + loadX] = imgs[(ly * imgPixels + px) * numImages + lx];
                        }
                    }
                }
            }
            __syncthreads();

            // Is this pixel in my region?
            if (isInY && x >= myStartImgPxX && x < myEndImgPxX) {
                #pragma unroll
                for (int i = 0; i < imgsPerThread; i++) {
                    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                        #pragma unroll
                        for (int f = 0; f < filtersPerThread; f++) {
                            prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]);
                        }
                    }
                }
                ++regionSize;
            }
            __syncthreads();
        }
    }
    if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                #pragma unroll
                for (int f = 0; f < filtersPerThread; f++) {
                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
                }
            }
        }
    }
}

/*
 * imgs:        (numFilters, imgPixels, numImages)
 * target:      (numFilters, outputs, numImages)
 */
template<class Pooler>
void convLocalPool(NVMatrix& images, NVMatrix& target, int numFilters,
                   int subsX, int startX, int strideX, int outputsX, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = images.getNumRows() / numFilters;
    assert(images.getNumRows() == numFilters * imgPixels);
    int imgSize = int(sqrt(imgPixels));
    assert(imgSize * imgSize == imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    int outputs = outputsX * outputsX;
    target.resize(numFilters*outputs, numImages);

    if (strideX == 1 && subsX >= 6 && outputsX > 1) {
        // NOTE: this part has not been optimized for Kepler
        int imgsPerThread = numImages % 128 == 0 ? 8 : 4;
        int filtersPerThread = numFilters % 4 == 0 ? 4 : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1;
        int bx = 8;
        bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
        assert((imgsPerThread * bx) % 32 == 0);
        assert(numFilters % filtersPerThread == 0);
        dim3 threads(bx, 16);
        dim3 blocks(DIVUP(outputsX, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(outputsX, 4) * numFilters / filtersPerThread);
//        printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n",
//               threads.y, threads.x, blocks.y, blocks.x, imgSize, numFilters, numImages, subsX, startX, outputsX);
        if (imgsPerThread == 8) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        }
    } else {
        int filtersPerThread = numFilters % 16 == 0 ? 4 : 1;
        int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
        bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
        dim3 threads(32, 4);
        dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
        if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 2) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        }
    }
    getLastCudaError("convLocalPool: kernel execution failed");
}

#endif /* CONV_UTIL_CUH */
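The clamping arithmetic that kLocalPool and kLocalPool2 use to handle pooling windows hanging off the image edge (and that makes average pooling divide by the true window area) is easy to check on the CPU. A small numpy sketch of one output of overlapped average pooling, not a port of the kernels:

import numpy as np

img_size, subs_x, start_x, stride_x = 8, 4, -2, 2   # 4x4 window, padding 2, stride 2
img = np.arange(img_size * img_size, dtype=np.float32).reshape(img_size, img_size)

out_y = out_x = 0                                   # first pooling output
sy, sx = start_x + out_y * stride_x, start_x + out_x * stride_x
y0, x0 = max(0, sy), max(0, sx)                     # loopStartY/X in the kernels
y1, x1 = min(img_size, sy + subs_x), min(img_size, sx + subs_x)  # loopEndY/X
region = img[y0:y1, x0:x1]
print(region.sum() / region.size)   # divides by the clamped regionSize, here 2*2 = 4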
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef COMMON_CUH
#define COMMON_CUH

#include <helper_cuda.h> // helper functions CUDA error checking and initialization
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "conv_util.cuh"
#include "caffe2/core/context_gpu.h"

enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE };

void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int paddingStart,
    int moduleStride, int numImgColors, int numGroups);
void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int paddingStart,
    int moduleStride, int numImgColors, int numGroups,
    float scaleTargets, float scaleOutput);

void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int paddingStart,
    int moduleStride, int numImgColors, int numGroups);
void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int paddingStart,
    int moduleStride, int numImgColors, int numGroups,
    float scaleTargets, float scaleOutput);

void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int imgSizeX, int numModulesY, int paddingStart,
    int moduleStride, int numImgColors, int numGroups);
void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int imgSizeX, int numModulesY, int paddingStart,
    int moduleStride, int numImgColors, int numGroups,
    float scaleTargets, float scaleOutput);

void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int imgSizeX, int numModulesY, int paddingStart,
    int moduleStride, int numImgColors, int numGroups);
void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
    int imgSizeY, int imgSizeX, int numModulesY, int paddingStart,
    int moduleStride, int numImgColors, int numGroups,
    float scaleTargets, float scaleOutput);

void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart,
    int moduleStride, int numImgColors, int numGroups, int sumWidth);
void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart,
    int moduleStride, int numImgColors, int numGroups, int sumWidth,
    float scaleTargets, float scaleOutput);

void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart,
    int moduleStride, int numImgColors, int numGroups);
void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart,
    int moduleStride, int numImgColors, int numGroups,
    float scaleTargets, float scaleOutput);

#endif /* COMMON_CUH */
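Each operation above comes in two overloads: the short one overwrites targets, while the scaleTargets/scaleOutput variant blends the new result into the existing buffer. Assuming the usual cuda-convnet convention (target = scaleTargets * target + scaleOutput * result), the plain overload behaves like scaleTargets = 0, scaleOutput = 1, and gradient accumulation uses scaleTargets = 1. A numpy sketch of that contract:

import numpy as np

def apply_scaled(target, result, scale_targets, scale_output):
    # target = scaleTargets * target + scaleOutput * result
    return scale_targets * target + scale_output * result

target = np.zeros(4, np.float32)
result = np.ones(4, np.float32)
target = apply_scaled(target, result, 0.0, 1.0)  # overwrite:   [1, 1, 1, 1]
target = apply_scaled(target, result, 1.0, 1.0)  # accumulate:  [2, 2, 2, 2]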
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <vector>
#include <iostream>

#include "../../nvmatrix/include/nvmatrix_kernels.cuh"
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../include/conv_util.cuh"

using namespace std;

__device__ inline float square(const float a) {
    return a * a;
}

/*
 * Horizontal reflection.
 * imgs:    (numColors, imgSize, imgSize, numCases)
 * targets: (numColors, imgSize, imgSize, numCases)
 *
 * targets should be a different array from imgs.
 *
 * Block size: (4, 32)
 * blockIdx.y * 4 + threadIdx.y determines pixel
 * blockIdx.x * 32 * imgsPerThread + threadIdx.x determines case batch
 */
template<int numColors, int imgsPerThread, bool checkCaseBounds>
__global__ void kReflectH(float * imgs, float * targets, const int imgSize, const int numCases) {
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;
    const int imgPixels = imgSize * imgSize;

    if (pxIdx < imgPixels) {
        const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
        const int pxIdxY = pxIdx / imgSize;
        const int pxIdxX = pxIdx % imgSize;

        const int pxIdxXR = imgSize - 1 - pxIdxX; // reflected coordinate
        const int pxIdxR = pxIdxY * imgSize + pxIdxXR;

        imgs += pxIdx * numCases + caseIdx;
        targets += pxIdxR * numCases + caseIdx;

        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numCases) {
                #pragma unroll
                for (int c = 0; c < numColors; ++c) {
                    targets[c * imgPixels * numCases + i * 32] = imgs[c * imgPixels * numCases + i * 32];
                }
            }
        }
    }
}

/*
 * Horizontal reflection.
 * imgs:    (numColors, imgSize, imgSize, numCases)
 * targets: (numColors, imgSize, imgSize, numCases)
 */
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize) {
    int numCases = images.getNumCols();
    int imgPixels = imgSize * imgSize;
    int numColors = images.getNumRows() / imgPixels;
    assert(numColors * imgPixels == images.getNumRows());
    assert(numColors > 0 && numColors <= 3);

    targets.resize(images);
    int imgsPerThread = numCases % 128 == 0 ? 4 : numCases % 64 == 0 ? 2 : 1;
    bool checkCaseBounds = numCases % (32 * imgsPerThread) != 0;
    dim3 threads(32, 4);
    dim3 blocks(DIVUP(numCases, imgsPerThread * 32), DIVUP(imgPixels, 4));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (checkCaseBounds) {
        if (numColors == 1) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<1, 1, true>, cudaFuncCachePreferL1);
                kReflectH<1, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<1, 2, true>, cudaFuncCachePreferL1);
                kReflectH<1, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<1, 4, true>, cudaFuncCachePreferL1);
                kReflectH<1, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        } else if (numColors == 2) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<2, 1, true>, cudaFuncCachePreferL1);
                kReflectH<2, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<2, 2, true>, cudaFuncCachePreferL1);
                kReflectH<2, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<2, 4, true>, cudaFuncCachePreferL1);
                kReflectH<2, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        } else if (numColors == 3) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<3, 1, true>, cudaFuncCachePreferL1);
                kReflectH<3, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<3, 2, true>, cudaFuncCachePreferL1);
                kReflectH<3, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<3, 4, true>, cudaFuncCachePreferL1);
                kReflectH<3, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        }
    } else {
        if (numColors == 1) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<1, 1, false>, cudaFuncCachePreferL1);
                kReflectH<1, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<1, 2, false>, cudaFuncCachePreferL1);
                kReflectH<1, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<1, 4, false>, cudaFuncCachePreferL1);
                kReflectH<1, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        } else if (numColors == 2) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<2, 1, false>, cudaFuncCachePreferL1);
                kReflectH<2, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<2, 2, false>, cudaFuncCachePreferL1);
                kReflectH<2, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<2, 4, false>, cudaFuncCachePreferL1);
                kReflectH<2, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        } else if (numColors == 3) {
            if (imgsPerThread == 1) {
                cudaFuncSetCacheConfig(kReflectH<3, 1, false>, cudaFuncCachePreferL1);
                kReflectH<3, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 2) {
                cudaFuncSetCacheConfig(kReflectH<3, 2, false>, cudaFuncCachePreferL1);
                kReflectH<3, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            } else if (imgsPerThread == 4) {
                cudaFuncSetCacheConfig(kReflectH<3, 4, false>, cudaFuncCachePreferL1);
                kReflectH<3, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), targets.getDevData(), imgSize, numCases);
            }
        }
    }
    getLastCudaError("kReflectH: kernel execution failed");
}

/*
 * blockIdx.y determines module in batches of B_Y
 * blockIdx.x determines filter in batches of B_X * filtersPerThread
 *
 * weights: (numModules, numColors, filterPixels, numFilters)
 *
 * Not fully coalesced if B_X < 32, so use cache.
 */
template <int B_Y, int B_X, int filtersPerThread>
__global__ void kNormalizeLCWeights(float* weights, const uint numFilters, const int numModules, const uint weightsPerFilter, const float norm) {
    const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y;
    const uint filterIdx = B_X * blockIdx.x + threadIdx.x;

    float prod[filtersPerThread];
    #pragma unroll
    for (uint i = 0; i < filtersPerThread; ++i) {
        prod[i] = 0;
    }
    if (moduleIdx < numModules) {
        weights += moduleIdx * weightsPerFilter * numFilters + filterIdx;
        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                prod[i] += square(weights[p * numFilters + i * B_X]);
            }
        }

        #pragma unroll
        for (uint i = 0; i < filtersPerThread; ++i) {
            prod[i] = sqrtf(prod[i]);
            prod[i] = prod[i] > norm ? __fdividef(norm, prod[i]) : 1.0f;
        }

        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                weights[p * numFilters + i * B_X] *= prod[i];
            }
        }
    }
}

/*
 * weights: (numModules, numColors, filterPixels, numFilters)
 */
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) {
    int numFilters = weights.getNumCols();
    int weightsPerFilter = weights.getNumRows() / numModules;
    assert(numModules * weightsPerFilter == weights.getNumRows());

    assert(!weights.isTrans());
    assert(weights.isContiguous());
    assert(numFilters % 16 == 0);

    int bx = numFilters % 32 == 0 ? 32 : 16;
    int by = bx == 32 ? 4 : 8;

    int filtersPerThread = numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1;
    dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by));
    dim3 threads(bx, by);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (filtersPerThread == 4) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 4><<<blocks, threads, 0, stream>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else if (filtersPerThread == 2) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 2><<<blocks, threads, 0, stream>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else {
        if (numFilters % 32 == 0) {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<4, 32, 1><<<blocks, threads, 0, stream>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        } else {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<8, 16, 1><<<blocks, threads, 0, stream>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        }
    }
}

/*
 * Block size 4x32
 * blockIdx.x determines img idx in batches of 32*imgsPerThread
 * blockIdx.y determines channel idx, pixel idx in batches of 4
 *
 * threadIdx.x determines case idx
 * threadIdx.y determines pixel idx
 *
 * imgs:    (numChannels, imgPixels, numImages) with given imgStride
 * target:  (numChannels, tgtPixels, numImages)
 */
template<int imgsPerThread, bool checkCaseBounds>
__global__ void kCrop(float* imgs, float* target, const uint numImages, const int imgStride,
                      const uint imgSize, const uint tgtSize, const uint startY, const uint startX) {
    const uint imgPixels = imgSize * imgSize;
    const uint tgtPixels = tgtSize * tgtSize;
    const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4);
    const uint tgtPixelIdx = 4*(blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y;
    const uint tgtPxY = tgtPixelIdx / tgtSize;
    const uint tgtPxX = tgtPixelIdx % tgtSize;
    const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX;

    if (tgtPixelIdx < tgtPixels) {
        imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx;
        target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx;

        #pragma unroll
        for (uint i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) {
                target[i * 32] = imgs[i * 32];
            }
        }
    }
}

/*
 * Block size 4x32
 * blockIdx.y determines pixel idx in batches of 4
 * blockIdx.x determines case idx in batches of 32*imgsPerThread
 * threadIdx.y determines pixel idx
 * threadIdx.x determines case idx
 *
 * imgs:    (3, imgPixels, numImages) with given imgStride
 * target:  (3, imgPixels, numImages)
 *
 * Each thread produces (y,u,v) values for a particular (r,g,b) pixel
 *
 * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV):
 *
 * [Y]   [ 0.2126     0.7152    0.0722  ][R]
 * [U] = [-0.09991   -0.33609   0.436   ][G]
 * [V]   [ 0.615     -0.55861  -0.05639 ][B]
 */
template<int imgsPerThread, bool checkCaseBounds>
__global__ void kRGBToYUV(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;

    if (pxIdx < imgPixels) {
        const int imgChannelStride = imgPixels * imgStride;
        const int tgtChannelStride = imgPixels * numImages;
        imgs += pxIdx * imgStride + caseIdx;
        target += pxIdx * numImages + caseIdx;
        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
                const float R = imgs[0 * imgChannelStride + i * 32];
                const float G = imgs[1 * imgChannelStride + i * 32];
                const float B = imgs[2 *
imgChannelStride + i * 32]; target[0 * tgtChannelStride + i * 32] = 0.2126f * R + 0.7152f * G + 0.0722f * B; // Y target[1 * tgtChannelStride + i * 32] = -0.09991f * R + -0.33609f * G + 0.436f * B; // U target[2 * tgtChannelStride + i * 32] = 0.615f * R + -0.55861f * G + -0.05639f * B; // V } } } } __device__ inline float labf(const float x) { if (x > 0.0088564517f) { return __powf(x, 0.3333f); } return 7.787037f * x + 0.13793103f; } /* * Block size 4x32 * blockIdx.y determines pixel idx in batches of 4 * blockIdx.x determines case idx in batches of 32*imgsPerThread * threadIdx.y determines pixel idx * threadIdx.x determines case idx * * imgs: (3, imgPixels, numImages) with given imgStride * target: (3, imgPixels, numImages) * * This proceeds in two steps. * * - First, RGB values are linearly transformed to XYZ as per * http://en.wikipedia.org/wiki/CIE_XYZ_color_space * - Second, XYZ values are nonlinearly transformed to L*a*b* as per * http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation * * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel * * The RGB --> XYZ transform is: * * [X] [0.49 0.31 0.2 ][R] * [Y] = 5.6506753 * [0.17697 0.8124 0.01063 ][G] * [Z] [0 0.01 0.99 ][B] * * NOTE: The input should be in the range 0-1. Don't do mean-subtraction beforehand. * * Then X_max, Y_max, Z_max = 5.6506753. * * The range of the L* values is [0, 100]. * If the center flag is given, the range will be [-50, 50]. * */ template __global__ void kRGBToLAB(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) { const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; const int pxIdx = blockIdx.y * 4 + threadIdx.y; if (pxIdx < imgPixels) { const int imgChannelStride = imgPixels * imgStride; const int tgtChannelStride = imgPixels * numImages; imgs += pxIdx * imgStride + caseIdx; target += pxIdx * numImages + caseIdx; #pragma unroll for (int i = 0; i < imgsPerThread; ++i) { if (!checkCaseBounds || caseIdx + i * 32 < numImages) { const float R = imgs[0 * imgChannelStride + i * 32]; const float G = imgs[1 * imgChannelStride + i * 32]; const float B = imgs[2 * imgChannelStride + i * 32]; const float X = (0.49f * R + 0.31f * G + 0.2f * B); const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B); const float Z = (0.01f * G + 0.99f * B); const float labX = labf(X); const float labY = labf(Y); const float labZ = labf(Z); target[0 * tgtChannelStride + i * 32] = 116.0f * labY - 16.0f - (center ? 50.0f : 0); // L* target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY); // a* target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ); // b* } } } } /* * Block size 16x32. * Each block produces a 4x4 chunk of the output image. * threadIdx.y determines pixel idx in 4x4 chunk. * threadIdx.x determines case idx. * blockIdx.x determines case idx in batches of 32*imgsPerThread. * blockIdx.y determines 4x4 chunk idx, channel idx. * * imgs: (numChannels, imgPixels, numImages) with given imgStride * target: (numChannels, tgtPixels, numImages) * * imgSize = scale * tgtSize (roughly) * * This is a rather naive kernel that relies on cache for speed. But all it's doing * is basic texture manipulation, which is very local in nature, so it should be ok. * Also, it will in practice be a tiny fraction of the runtime of a large convnet. * * So that is my justification for being lazy here. 
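 *
 * Interpolation sketch (commentary added here, derived from the kernel body
 * below rather than from the original comment): for a target pixel (pxX, pxY),
 *   srcPxX = clamp(pxX * scale + centerScale, 0, imgSize - 1.01)
 *   u = ceil(srcPxX) - srcPxX    (weight of the left source column)
 *   w = srcPxY - floor(srcPxY)   (weight of the bottom source row)
 * and with the four neighboring taps val0..val3 (top-left, top-right,
 * bottom-left, bottom-right):
 *   c0  = u * val0 + (1 - u) * val1     (top-row lerp)
 *   c1  = u * val2 + (1 - u) * val3     (bottom-row lerp)
 *   out = (1 - w) * c0 + w * c1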
*/ template __global__ void kResizeBilinear(float* imgs, float* target, const int imgSize, const int tgtSize, const int numImages, const int imgStride, const float scale, const float centerScale) { const int numChunksX = DIVUP(tgtSize, 4); const int numChunks = numChunksX * numChunksX; const int channelIdx = blockIdx.y / numChunks; const int chunkIdx = blockIdx.y % numChunks; const int chunkIdxX = chunkIdx % numChunksX; const int chunkIdxY = chunkIdx / numChunksX; const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x; const int imgPixels = imgSize * imgSize; const int tgtPixels = tgtSize * tgtSize; const int pxX = 4 * chunkIdxX + threadIdx.y % 4; const int pxY = 4 * chunkIdxY + threadIdx.y / 4; if (pxY < tgtSize && pxX < tgtSize) { const int pxIdx = pxY * tgtSize + pxX; imgs += channelIdx * imgPixels * imgStride + caseIdx; target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx; // This will cause slight distortions at the edges when upsampling in some cases. // But I think that's not a big deal. const float srcPxX = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxX) * scale + centerScale)); const float srcPxY = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxY) * scale + centerScale)); const float u = floorf(srcPxX + 1) - srcPxX; const float w = srcPxY - floorf(srcPxY); // Consider doing max(0, min(imgSize, x)) here const int srcPx0 = (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left const int srcPx1 = srcPx0 + 1; // top-right const int srcPx2 = srcPx0 + imgSize; // bottom-left const int srcPx3 = srcPx2 + 1; // bottom-right #pragma unroll for (int c = 0; c < imgsPerThread; ++c) { if (!checkCaseBounds || caseIdx + c * 32 < numImages) { const float val0 = imgs[srcPx0 * imgStride + c * 32]; const float val1 = imgs[srcPx1 * imgStride + c * 32]; const float val2 = imgs[srcPx2 * imgStride + c * 32]; const float val3 = imgs[srcPx3 * imgStride + c * 32]; const float c0 = u * (val0 - val1) + val1; const float c1 = u * (val2 - val3) + val3; target[32 * c] = w * (c1 - c0) + c0; } } } } /* * Block size B_YxB_X. * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx * * imgs: (numChannels, imgPixels, numImages) with given imgStride * filter: (1, 2*radius + 1) * target: (numChannels, imgPixels, numImages) * * target can be the same matrix as imgs. * radius must be one of 3, 5, 7, 9. * * Tried imgsPerThread, slower. 
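 *
 * Scheme sketch (commentary added here, inferred from the loop structure
 * below): the kernel keeps 2*radius running partial sums in registers and
 * streams each row (or column, when !horiz) one pixel at a time, so every
 * input pixel is read only once per pass. Because a 2-D Gaussian kernel is
 * separable, a full blur can be obtained by calling convGaussianBlur twice,
 * once with horiz=true and once with horiz=false.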
*/ template __global__ void kGaussianBlur(float* imgs, float* filter, float* target, const int imgSize, const int numImages, const int imgStride, const int numChannels, const bool horiz, const float scaleTargets, const float scaleOutputs) { const int filterWidth = 2*radius+1; __shared__ float shFilter[filterWidth-1]; const int imgPixels = imgSize * imgSize; const int ty = B_Y * blockIdx.y + threadIdx.y; const int channelIdx = ty / imgSize; const int rowIdx = ty % imgSize; const int imgIdx = B_X*blockIdx.x + threadIdx.x; // const int tidx = B_Y * threadIdx.y + threadIdx.x; if (horiz) { imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride + imgIdx; target += channelIdx * imgPixels * numImages + rowIdx * imgSize * numImages + imgIdx; } else { imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx; target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx; } float outputs[filterWidth-1]; #pragma unroll for (int r = 0; r < filterWidth-1; r++) { outputs[r] = 0; } if (threadIdx.x < filterWidth-1) { shFilter[threadIdx.x] = filter[threadIdx.x]; } __syncthreads(); if (imgIdx < numImages && channelIdx < numChannels) { // This writes radius*2 = filterWidth - 1 values to outputs #pragma unroll for (int col = 0; col < radius; col++) { float px = imgs[0]; #pragma unroll for (int r = 0; r < radius + 1 + col; r++) { outputs[r] += px * shFilter[radius + col - r]; } imgs += horiz ? imgStride : imgStride * imgSize; } // Unfortunately this has to be at this level of granularity if (scaleTargets != 0) { for (int col = radius; col < imgSize ; col++) { // loop over img columns float px = imgs[0]; target[0] = scaleTargets * target[0] + scaleOutputs * (outputs[0] + px * shFilter[0]); #pragma unroll for (int r = 1; r < radius*2; r++) { outputs[r-1] = outputs[r] + px * shFilter[r]; } outputs[filterWidth - 2] = px * shFilter[0]; imgs += horiz ? imgStride : imgStride * imgSize; target += horiz ? numImages : numImages * imgSize; } #pragma unroll for (int r = 0; r < radius; r++) { float* t = &target[0]; t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r]; target += horiz ? numImages : numImages * imgSize; } } else { for (int col = radius; col < imgSize ; col++) { // loop over img columns float px = imgs[0]; target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]); #pragma unroll for (int r = 1; r < radius*2; r++) { outputs[r-1] = outputs[r] + px * shFilter[r]; } outputs[filterWidth - 2] = px * shFilter[0]; imgs += horiz ? imgStride : imgStride * imgSize; target += horiz ? numImages : numImages * imgSize; } #pragma unroll for (int r = 0; r < radius; r++) { target[0] = scaleOutputs * outputs[r]; target += horiz ? numImages : numImages * imgSize; } } } } /* * Block size B_YxB_X * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread * * So each block does one output for some number of images/filters. 
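 *
 * Sampling sketch (commentary added here, inferred from the kernel body):
 * "bed of nails" subsampling copies the single input pixel at
 * (startX + outputIdxY * strideX, startX + outputIdxX * strideX) to each
 * output position; no pooling window is averaged. With reverse=true the same
 * index map runs backward, scattering output gradients onto those input
 * pixels. For example, imgSize=8, startX=0, strideX=2 gives
 * outputsX = DIVUP(8, 2) = 4, i.e. a 4x4 grid of kept pixels.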
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numChannels, imgPixels, numImages) * target: (numChannels, numOutputs, numImages) * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by filtersPerThread */ template __global__ void kBedOfNails(float* imgs, float* target, const int imgSize, const int numChannels, const int numImages, const int startX, const int strideX, const int outputsX, const bool reverse, const float scaleTargets, const float scaleOutput) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int numChanBlocks = DIVUP(numChannels, B_Y*chansPerThread); const int outputIdxX = blockIdx.x / numImgBlocks; const int outputIdxY = blockIdx.y / numChanBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockChanIdx = (blockIdx.y % numChanBlocks) * B_Y * chansPerThread; const int myChanIdx = (blockChanIdx + threadIdx.y*chansPerThread); if (myChanIdx >= numChannels) { return; } // if (blockIdx.x != 0 || blockIdx.y != 0) { // return; // } const int outputIdx = outputIdxY * outputsX + outputIdxX; const int numOutputs = outputsX * outputsX; const int imgPixels = imgSize * imgSize; const int startImgPxX = startX + outputIdxX * strideX; const int startImgPxY = startX + outputIdxY * strideX; const int imgIdx = blockImgIdx + threadIdx.x; const int imgPx = startImgPxY * imgSize + startImgPxX; imgs += myChanIdx * imgPixels * numImages + imgPx * numImages + imgIdx; target += (myChanIdx * numOutputs + outputIdx) * numImages + imgIdx; if (scaleTargets != 0) { if (!reverse) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int c = 0; c < chansPerThread; c++) { target[c * numOutputs * numImages + i * B_X] = scaleTargets * target[c * numOutputs * numImages + i * B_X] + scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int c = 0; c < chansPerThread; c++) { imgs[c * imgPixels * numImages + i * B_X] = scaleTargets * imgs[c * imgPixels * numImages + i * B_X] + scaleOutput * target[c * numOutputs * numImages + i * B_X]; } } } } } else { if (!reverse) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int c = 0; c < chansPerThread; c++) { target[c * numOutputs * numImages + i * B_X] = scaleOutput * imgs[c * imgPixels * numImages + i * B_X]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int c = 0; c < chansPerThread; c++) { imgs[c * imgPixels * numImages + i * B_X] = scaleOutput * target[c * numOutputs * numImages + i * B_X]; } } } } } } /* * imgs: (numChannels, imgPixels, numImages) * target: (numChannels, outputs, numImages) */ void _convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX, int strideX, bool reverse, float scaleTargets, float scaleOutput) { int numImages = reverse ? 
target.getNumCols() : images.getNumCols(); int imgPixels = imgSize * imgSize; assert(!images.isTrans()); assert(!target.isTrans()); assert(images.isContiguous()); assert(target.isContiguous()); assert(strideX > 1); int outputsX = DIVUP(imgSize, strideX); int outputs = outputsX * outputsX; if (reverse) { assert(target.getNumRows() == numChannels * outputs); } else { assert(images.getNumRows() == numChannels * imgPixels); } if (scaleTargets == 0) { if (reverse) { images.resize(numChannels * imgPixels, numImages); images.apply(NVMatrixOps::Zero()); } else { target.resize(numChannels*outputs, numImages); } } else { if (reverse) { assert(images.getNumRows() == numChannels * outputs); assert(images.getNumCols() == numImages); } else { assert(target.getNumRows() == numChannels * outputs); assert(target.getNumCols() == numImages); } } int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; int chansPerThread = numChannels % 8 == 0 ? 2 : 1; dim3 threads(32, 4); dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numChannels, 4 * chansPerThread) * outputsX); cudaStream_t stream = NVMatrix::getDefaultStream(); if (imgsPerThread == 4) { if (chansPerThread == 1) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 4, 1, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 4, 1, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 4, 2, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 4, 2, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } } else if (imgsPerThread == 2) { if (chansPerThread == 1) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 2, 1, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 2, 1, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 2, 2, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 2, 2, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } } else { if (chansPerThread == 1) { if 
(checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 1, 1, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 1, 1, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 1, 2, true><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1); kBedOfNails<4, 32, 1, 2, false><<>>(images.getDevData(), target.getDevData(), imgSize, numChannels, numImages, startX, strideX, outputsX, reverse, scaleTargets, scaleOutput); } } } } void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX, int strideX, float scaleTargets, float scaleOutput) { _convBedOfNails(images, target, numChannels, imgSize, startX, strideX, false, scaleTargets, scaleOutput); } void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize, int startX, int strideX, float scaleTargets, float scaleOutput) { _convBedOfNails(target, actsGrad, numChannels, imgSize, startX, strideX, true, scaleTargets, scaleOutput); } /* * imgs: (numChannels, imgPixels, numImages) with given imgStride * filter: (1, 2*radius + 1) * target: (numChannels, imgPixels, numImages) */ void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels, float scaleTargets, float scaleOutputs) { int numImages = images.getNumCols(); int radius = filter.getNumCols() / 2; int imgPixels = images.getNumRows() / numChannels; int imgSize = int(sqrt(imgPixels)); assert(imgPixels == imgSize * imgSize); assert(radius >= 1 && radius <= 4); assert(imgSize >= 2 * radius + 1); assert(filter.getNumRows() == 1); assert(images.getNumRows() == numChannels * imgPixels); assert(!images.isTrans()); assert(!filter.isTrans()); assert(!target.isTrans()); assert(target.isContiguous()); if (scaleTargets == 0) { target.resize(images); } else { assert(target.isSameDims(images)); } dim3 threads(32, 4); dim3 blocks(DIVUP(numImages, threads.x), DIVUP(numChannels*imgSize, threads.y)); cudaStream_t stream = NVMatrix::getDefaultStream(); if (radius == 1) { cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1); kGaussianBlur<4, 32, 1><<>>(images.getDevData(), filter.getDevData(), target.getDevData(), imgSize, numImages, images.getStride(), numChannels, horiz, scaleTargets, scaleOutputs); } else if (radius == 2) { cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1); kGaussianBlur<4, 32, 2><<>>(images.getDevData(), filter.getDevData(), target.getDevData(), imgSize, numImages, images.getStride(), numChannels,horiz, scaleTargets, scaleOutputs); } else if (radius == 3) { cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1); kGaussianBlur<4, 32, 3><<>>(images.getDevData(), filter.getDevData(), target.getDevData(), imgSize, numImages, images.getStride(), numChannels,horiz, scaleTargets, scaleOutputs); } else if (radius == 4) { cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, 
cudaFuncCachePreferL1); kGaussianBlur<4, 32, 4><<>>(images.getDevData(), filter.getDevData(), target.getDevData(), imgSize, numImages, images.getStride(), numChannels,horiz, scaleTargets, scaleOutputs); } } /* * Block size 1x128 * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread * blockIdx.y determines pixel.y * * So each block does one output for some number of images and all the fliters. * * threadIdx.x determines img idx * * imgs: (numFilters, imgPixels, numImages) * meanDiffs: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by B_Y*filtersPerThread */ template __global__ void kCNorm_fewfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize, const int numImages, const int sizeX, const float addScale, const float powScale, const float minDiv) { const int imgPixels = imgSize * imgSize; const int numImgBlocks = DIVUP(numImages, 128*imgsPerThread); const int pxIdxX = blockIdx.x / numImgBlocks; const int pxIdxY = blockIdx.y; const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int startPxX = -sizeX/2 + pxIdxX; const int startPxY = -sizeX/2 + pxIdxY; const int imgIdx = blockImgIdx + threadIdx.x; imgs += pxIdx * numImages + imgIdx; denoms += pxIdx * numImages + imgIdx; meanDiffs += imgIdx; target += pxIdx * numImages + imgIdx; float prod[numFilters][imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * 128 < numImages) { #pragma unroll for (int f = 0; f < numFilters; f++) { prod[f][i] = 0; } } } const int loopStartY = MAX(0, startPxY); const int loopStartX = MAX(0, startPxX); const int loopEndY = MIN(imgSize, startPxY + sizeX); const int loopEndX = MIN(imgSize, startPxX + sizeX); for (int y = loopStartY; y < loopEndY; y++) { for (int x = loopStartX; x < loopEndX; x++) { const int imgPx = y * imgSize + x; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * 128 < numImages) { #pragma unroll for (int f = 0; f < numFilters; f++) { prod[f][i] += square(meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]); } } } } } #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * 128 < numImages) { #pragma unroll for (int f = 0; f < numFilters; f++) { prod[f][i] = minDiv + addScale * prod[f][i]; denoms[f * imgPixels * numImages + i * 128] = prod[f][i]; target[f * imgPixels * numImages + i * 128] = imgs[f * imgPixels * numImages + i * 128] * __powf(prod[f][i], -powScale); } } } } /* * Block size B_YxB_X * blockIdx.x determines image idx in batches of B_X*imgsPerThread * blockIdx.y determines filter idx in batches of B_Y*filtersPerThread * blockIdx.z determines pixel * * So each block does one pixel for some number of images/filters. 
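 *
 * Normalization sketch (commentary added here, matching the arithmetic shared
 * by both kCNorm kernels): for every (filter, pixel, image) triple they compute
 *   denom  = minDiv + addScale * sum(square(meanDiffs)) over a sizeX x sizeX
 *            spatial window offset by -sizeX/2 around the pixel
 *   target = imgs * denom^(-powScale)
 * and also store denom, since the backward pass (kRNormUndo further below)
 * reuses it.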
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numFilters, imgPixels, numImages) * means: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by B_Y*filtersPerThread */ template __global__ void kCNorm_manyfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeX, const float addScale, const float powScale, const float minDiv) { const int imgPixels = imgSize * imgSize; const int pxIdxX = blockIdx.z % imgSize; const int pxIdxY = blockIdx.z / imgSize; const int blockImgIdx = blockIdx.x * B_X * imgsPerThread; const int blockFilterIdx = blockIdx.y * B_Y * filtersPerThread; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int startPxX = -sizeX/2 + pxIdxX; const int startPxY = -sizeX/2 + pxIdxY; const int imgIdx = blockImgIdx + threadIdx.x; imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx; denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] = 0; } } } const int loopStartY = max(0, startPxY); const int loopStartX = max(0, startPxX); const int loopEndY = min(imgSize, startPxY + sizeX); const int loopEndX = min(imgSize, startPxX + sizeX); for (int y = loopStartY; y < loopEndY; y++) { for (int x = loopStartX; x < loopEndX; x++) { const int imgPx = y * imgSize + x; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[f][i] += square(meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]); } } } } } #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[f][i] = minDiv + addScale * prod[f][i]; denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; target[f * B_Y * imgPixels * numImages + i * B_X] = imgs[f * B_Y * imgPixels * numImages + i * B_X] * __powf(prod[f][i], -powScale); } } } } /* * Block size 16xB_X * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread * * So each block does 4x4 region of pixels for some number of images/filters. * * threadIdx.x determines img idx * threadIdx.y determines pixel idx * * imgs: (numFilters, imgPixels, numImages) * means: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) * * B_X one of 8, 16, 32 * imgsPerThread one of 1, 2, 4, 8, 16 * * B_XximgsPerThread MUST be divisible by 32. * Number of filters MUST be divisible by filtersPerThread. * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by filtersPerThread * * Final write-out will not be fully coalesced unless B_X is 32. 
But there's a lot more * reading than writing here, and the reading is all coalesced, so it should be OK. */ template __global__ void kCNorm2(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeX, const float addScale, const float powScale, const float minDiv) { __shared__ float shDiffs[filtersPerThread][B_X*imgsPerThread]; const int imgPixels = imgSize * imgSize; const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread); const int numFilterBlocks = numFilters/(filtersPerThread); const int blockPxX = 4*(blockIdx.x / numImgBlocks); const int blockPxY = 4*(blockIdx.y / numFilterBlocks); const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; const int tidx = threadIdx.y * B_X + threadIdx.x; const int loadY = tidx / 32, loadX = tidx % 32; const int startPxX = MAX(0, -sizeX/2 + blockPxX); const int startPxY = MAX(0, -sizeX/2 + blockPxY); const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3); const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3); const int myPxX = blockPxX + threadIdx.y % 4; const int myPxY = blockPxY + threadIdx.y / 4; const int myPxIdx = myPxY * imgSize + myPxX; // const bool doWork = myPxX < imgSize && myPxY < imgSize; const int myStartPxY = -sizeX/2 + myPxY; const int myStartPxX = -sizeX/2 + myPxX; const int myEndPxY = myPxY + DIVUP(sizeX, 2); const int myEndPxX = myPxX + DIVUP(sizeX, 2); const int imgIdx = blockImgIdx + threadIdx.x; imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; meanDiffs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] = 0; } } } for (int y = startPxY; y < endPxY; y++) { const bool isInY = y >= myStartPxY && y < myEndPxY; for (int x = startPxX; x < endPxX; x++) { const int px = y * imgSize + x; // All the threads load a pixel from memory #pragma unroll for (int ly = 0; ly < filtersPerThread; ly += B_X/2) { if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) { #pragma unroll for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) { if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { shDiffs[ly + loadY][lx + loadX] = meanDiffs[(ly * imgPixels + px) * numImages + lx]; } } } } __syncthreads(); // Each row of threads decides if it's interested in this pixel if (isInY && x >= myStartPxX && x < myEndPxX) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]); } } } } __syncthreads(); } } // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX; // imgs += threadIdx.x; if (myPxX < imgSize && myPxY < imgSize) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] = minDiv + addScale * prod[f][i]; denoms[f * imgPixels * numImages + i * B_X] = prod[f][i]; target[f * imgPixels * numImages + i * B_X] = imgs[f * imgPixels * 
numImages + i * B_X] * __powf(prod[f][i], -powScale); } } } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y * * So each block does one pixel for some number of images/filters. * * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numFilters, imgPixels, numImages) * meanDiffs: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by B_Y */ template __global__ void kFCNorm(cudaTextureObject_t imgs, cudaTextureObject_t meanDiffs, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeF, const float addScale, const float powScale, const float minDiv) { const int imgPixels = imgSize * imgSize; const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread); const int numFilterBlocks = numFilters/B_Y; const int pxIdxX = blockIdx.x / numImgBlocks; const int pxIdxY = blockIdx.y / numFilterBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int imgIdx = blockImgIdx + threadIdx.x; const int imgOffset = ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; const int meanDiffsOffset = pxIdx * numImages + imgIdx; // imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; // meanDiffs += pxIdx * numImages + imgIdx; target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; float prod[imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[i] = 0; } } const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF/2 + filterIdx; const int loopStartF = blocked ? startF : MAX(0, startF); const int loopEndF = MIN(numFilters, startF + sizeF); for (int f = loopStartF; f < loopEndF; ++f) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[i] += square(tex1Dfetch(meanDiffs, meanDiffsOffset + f * imgPixels * numImages + i * B_X)); } } } #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[i] = minDiv + addScale * prod[i]; target[i * B_X] = tex1Dfetch(imgs, imgOffset + i * B_X) * __powf(prod[i], -powScale); } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y * * So each block does one output pixel for some number of images/filters. 
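 *
 * Routing sketch (commentary added here, from the kernel body): the max-pool
 * gradient for an input activation is the sum of maxGrads over every output
 * whose cross-map window [startF + o*stride, startF + o*stride + poolSize)
 * covers this filter AND whose stored maxAct equals this input value, i.e.
 * prod += (img == ma) * mg. Ties therefore send the gradient to every input
 * that attained the max.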
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numFilters, imgPixels, numImages) * maxGrads: (numOutputs, imgPixels, numImages) * maxActs: (numOutputs, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y * * TODO: this isn't really ideal */ template __global__ void kCrossMapMaxPoolUndo(float* imgs, float* maxGrads, float* maxActs, float* target, const int imgSize, const int numFilters, const int numImages, const int startF, const int poolSize, const int numOutputs, const int stride, const float scaleTargets, const float scaleOutputs) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); // const int numOutputs = DIVUP(numFilters, stride); const int numFilterBlocks = numFilters/B_Y; const int pxIdxX = blockIdx.x / numImgBlocks; const int pxIdxY = blockIdx.y / numFilterBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; const int imgPixels = imgSize * imgSize; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int imgIdx = blockImgIdx + threadIdx.x; imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; maxGrads += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; maxActs += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx; target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; float prod[imgsPerThread]; // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) { // return; // } #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[i] = 0; } if (filterIdx < numFilters) { // const int startOut = max(0, (filterIdx-startF-poolSize)/ stride + 1); const int loopStartOut = max(0, (filterIdx-startF-poolSize)/ stride + 1); const int loopEndOut = min(numOutputs, (filterIdx - startF)/ stride + 1); for (int o = loopStartOut; o < loopEndOut; ++o) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { const float ma = maxActs[o * imgPixels * numImages + i * B_X]; const float mg = maxGrads[o * imgPixels * numImages + i * B_X]; const float img = imgs[i*B_X]; prod[i] += (img == ma) * mg; } } } // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF); if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { target[i * B_X] = prod[i]; } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { target[i * B_X] = scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; } } } } } /* * images: (numFilters, imgPixels, numImages) * maxGrads: (numOutputs, imgPixels, numImages) * maxActs: (numOutputs, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) */ void convCrossMapMaxPoolUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target, const int imgSize, const int startF, const int poolSize, const int stride, const float scaleTargets, const float scaleOutputs) { int numImages = images.getNumCols(); int imgPixels = imgSize * imgSize; int numFilters = images.getNumRows() / imgPixels; int numOutputs = maxActs.getNumRows() / imgPixels; assert(images.getNumRows() == numFilters * imgPixels); assert(maxGrads.getNumRows() == numOutputs * imgPixels); assert(maxGrads.getNumCols() == numImages); assert(maxGrads.isSameDims(maxActs)); assert(images.getNumRows() == numFilters * imgPixels); 
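    // Dispatch note (comment added here): imgsPerThread below is picked by
    // divisibility -- 4 when numImages % 128 == 0, else 2 when % 64 == 0,
    // else 1 -- and checkCaseBounds enables per-image bounds tests only when
    // numImages is not a multiple of 32*imgsPerThread, so the common,
    // nicely-sized case runs branch-free.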
assert(!images.isTrans()); assert(!target.isTrans()); assert(!maxGrads.isTrans()); assert(!maxActs.isTrans()); assert(images.isContiguous()); assert(maxGrads.isContiguous()); assert(maxActs.isContiguous()); assert(maxGrads.isSameDims(maxActs)); // assert(numFilters % 16 == 0); // assert(numImages % 128 == 0); assert(stride <= poolSize); assert(startF <= 0); assert(startF + (numOutputs-1) * stride + poolSize >= numFilters); // All filters must be covered dim3 threads(32, 4); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; dim3 blocks(imgSize * DIVUP(numImages, threads.x * imgsPerThread), imgSize * DIVUP(numFilters, threads.y)); bool checkCaseBounds = numImages % (threads.x*imgsPerThread) != 0; cudaStream_t stream = NVMatrix::getDefaultStream(); if (scaleTargets == 0) { target.resize(images); if (!checkCaseBounds) { if (imgsPerThread == 4) { kCrossMapMaxPoolUndo<4, 32, 4, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } else if (imgsPerThread == 2) { kCrossMapMaxPoolUndo<4, 32, 2, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } else { kCrossMapMaxPoolUndo<4, 32, 1, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } } else { kCrossMapMaxPoolUndo<4, 32, 1, false, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } } else { assert(target.isSameDims(images)); if (!checkCaseBounds) { if (imgsPerThread == 4) { kCrossMapMaxPoolUndo<4, 32, 4, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } else if (imgsPerThread == 2) { kCrossMapMaxPoolUndo<4, 32, 2, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } else { kCrossMapMaxPoolUndo<4, 32, 1, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } } else { kCrossMapMaxPoolUndo<4, 32, 1, true, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, scaleTargets, scaleOutputs); } } getLastCudaError("convCrossMapMaxPoolUndo: kernel execution failed"); } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y * * So each block does one output pixel for some number of images/filters. 
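 *
 * Gradient sketch (commentary added here, from the kernel body): prod
 * accumulates the precomputed `acts` term over the sizeF cross-map window
 * (contiguous or sliding, per the `blocked` flag), and each output is then
 *   target = inputs * prod + outGrads * denoms^(-powScale)
 * optionally blended with the existing target via scaleTargets/scaleOutputs
 * when add=true.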
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y * * TODO: this isn't really ideal */ template __global__ void kFRNormUndo(cudaTextureObject_t outGrads, cudaTextureObject_t denoms, cudaTextureObject_t inputs, cudaTextureObject_t acts, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeF, const float powScale, const float scaleTargets, const float scaleOutputs) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int numFilterBlocks = numFilters/B_Y; const int pxIdxX = blockIdx.x / numImgBlocks; const int pxIdxY = blockIdx.y / numFilterBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; const int imgPixels = imgSize * imgSize; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int imgIdx = blockImgIdx + threadIdx.x; const int actsOffset = pxIdx * numImages + imgIdx; const int inputOffset = ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; target += inputOffset; float prod[imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[i] = 0; } const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF + sizeF/2 + 1 + filterIdx; const int loopStartF = blocked ? startF : MAX(0, startF); const int loopEndF = MIN(numFilters, startF + sizeF); for (int f = loopStartF; f < loopEndF; ++f) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { prod[i] += tex1Dfetch(acts, actsOffset + f * imgPixels * numImages + i * B_X); } } } if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); const float den = tex1Dfetch(denoms, inputOffset + i * B_X); prod[i] = inp * prod[i] + out * __powf(den, -powScale); target[i * B_X] = prod[i]; } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { const float inp = tex1Dfetch(inputs, inputOffset + i * B_X); const float out = tex1Dfetch(outGrads, inputOffset + i * B_X); const float den = tex1Dfetch(denoms, inputOffset + i * B_X); prod[i] = inp * prod[i] + out * __powf(den, -powScale); target[i * B_X] = scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; } } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y * * So each block does one output pixel for some number of images/filters. * * threadIdx.x determines img idx * threadIdx.y determines filter idx * * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y * * TODO: this is pretty wasteful of computation. a lot of threads basically compute the same products. 
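 *
 * Variant note (commentary added here): unlike kFRNormUndo above, this kernel
 * does not read a stored `denoms` matrix -- it recomputes
 *   denoms = minDiv + addScale * sum(square(inputs))
 * over the window on the fly, trading redundant arithmetic (hence the TODO)
 * for one fewer input tensor.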
*/ template //__launch_bounds__(128,16) __global__ void kFRNormUndo2(cudaTextureObject_t outGrads, cudaTextureObject_t inputs, cudaTextureObject_t acts, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeF, const float addScale, const float powScale, const float minDiv, const float scaleTargets, const float scaleOutputs) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int numFilterBlocks = numFilters/B_Y; const int pxIdxX = blockIdx.x / numImgBlocks; const int pxIdxY = blockIdx.y / numFilterBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y; const int imgPixels = imgSize * imgSize; const int pxIdx = pxIdxY * imgSize + pxIdxX; const int imgIdx = blockImgIdx + threadIdx.x; const int inpOffset = pxIdx * numImages + imgIdx; const int outOffset = ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx; target += outOffset; float prod[imgsPerThread]; float denoms[imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[i] = 0; denoms[i] = 0; } int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF + sizeF/2 + 1 + filterIdx; int loopStartF = blocked ? startF : MAX(0, startF); int loopEndF = MIN(numFilters, startF + sizeF); for (int f = loopStartF; f < loopEndF; ++f) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { // If an input is zero, then we shuldn't divide by it. const float grad = tex1Dfetch(outGrads, inpOffset + f * imgPixels * numImages + i * B_X); const float act = tex1Dfetch(acts, inpOffset + f * imgPixels * numImages + i * B_X); const float inp = tex1Dfetch(inputs, inpOffset + f * imgPixels * numImages + i * B_X) + (act == 0); prod[i] += grad * act * __powf(__fdividef(act, inp), 1.0f/powScale); } } } startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF/2 + filterIdx; loopStartF = blocked ? startF : MAX(0, startF); loopEndF = MIN(numFilters, startF + sizeF); for (int f = loopStartF; f < loopEndF; ++f) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { denoms[i] += square(tex1Dfetch(inputs, inpOffset + f * imgPixels * numImages + i * B_X)); } } } if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { const float inp = tex1Dfetch(inputs, outOffset + i * B_X); const float out = tex1Dfetch(outGrads, outOffset + i * B_X); denoms[i] = addScale * denoms[i] + minDiv; prod[i] = (-2 * powScale * addScale * inp * prod[i] + out * __powf(denoms[i], -powScale)); target[i * B_X] = prod[i]; } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { const float inp = tex1Dfetch(inputs, outOffset + i * B_X); const float out = tex1Dfetch(outGrads, outOffset + i * B_X); denoms[i] = addScale * denoms[i] + minDiv; prod[i] = (-2 * powScale * addScale * inp * prod[i] + out * __powf(denoms[i], -powScale)); target[i * B_X] = scaleTargets * target[i * B_X] + scaleOutputs * prod[i]; } } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread * * So each block does one output pixel for some number of images/filters. 
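 *
 * Gradient sketch (commentary added here, from the kernel body): average
 * pooling spreads each output gradient uniformly over its (edge-clipped)
 * pooling region, so prod += avgGrads * regionSizeInv with
 *   regionSizeInv = 1 / (regionSizeX * regionSizeY)
 * for average pooling, or 1 when the `sum` template flag selects sum pooling.
 * The division is hoisted out of the inner loops deliberately; see the note
 * in the body about the 4x slowdown otherwise.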
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numFilters, imgPixels, numImages) * maxGrads: (numFilters, numOutputs, numImages) * rMaxActs: (numFilters, numOutputs, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y*filtersPerThread */ template __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize, const int numFilters, const int numImages, const int subsX, const int startX, const int strideX, const int outputsX, const float scaleTargets, const float scaleOutputs) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int blockPxX = blockIdx.x / numImgBlocks; const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread)); const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread; const int blockPx = blockPxY * imgSize + blockPxX; const int numOutputs = outputsX * outputsX; const int imgPixels = imgSize * imgSize; const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); const int startOutputX = blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX; const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX); const int imgIdx = blockImgIdx + threadIdx.x; avgGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx; target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[f][i] = 0; } } if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) { for (int my = startOutputY; my < endOutputY; my++) { const float regionStartY = fmaxf(0, startX + my * strideX); const float regionEndY = fminf(imgSize, startX + my * strideX + subsX); const float regionSizeY = regionEndY - regionStartY; for (int mx = startOutputX; mx < endOutputX; mx++) { const int outputIdx = my * outputsX + mx; const float regionStartX = fmaxf(0, startX + mx * strideX); const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX); const float regionSizeX = regionEndX - regionStartX; // It's important to do the division here, because pushing division into the below // loops makes the code 4x slower. const float regionSizeInv = sum ? 
1.0f : (1.0f / (regionSizeX * regionSizeY)); #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X] * regionSizeInv; } } } } } } if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i]; } } } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread * * So each block does one output pixel for some number of images/filters. * * threadIdx.x determines img idx * threadIdx.y determines filter idx * * imgs: (numFilters, imgPixels, numImages) * maxGrads: (numFilters, numOutputs, numImages) * maxActs: (numFilters, numOutputs, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y*filtersPerThread */ template __global__ void kLocalMaxUndo(float* imgs, float* maxGrads, float* maxActs, float* target, const int imgSize, const int numFilters, const int numImages, const int subsX, const int startX, const int strideX, const int outputsX, const float scaleTargets, const float scaleOutputs) { __shared__ float shImgs[B_Y*filtersPerThread][B_X*imgsPerThread]; const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int blockPxX = blockIdx.x / numImgBlocks; const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread)); const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread; const int blockPx = blockPxY * imgSize + blockPxX; const int numOutputs = outputsX * outputsX; const int imgPixels = imgSize * imgSize; const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX; const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX); const int startOutputX = blockPxX - startX < subsX ? 
0 : 1 + (blockPxX - startX - subsX) / strideX; const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX); const int imgIdx = blockImgIdx + threadIdx.x; imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; maxGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx; maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx; target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[f][i] = 0; } } if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] = imgs[f * B_Y * imgPixels * numImages + i * B_X]; } } } for (int my = startOutputY; my < endOutputY; my++) { for (int mx = startOutputX; mx < endOutputX; mx++) { const int outputIdx = my * outputsX + mx; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { const float ma = maxActs[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X]; const float mg = maxGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X]; const float img = shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i]; prod[f][i] += (img == ma) * mg; } } } } } } if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i]; } } } } } /* * acts := -2 x scale x acts x outGrads / denoms */ template __global__ void kRNormUndoPrelims(float* acts, cudaTextureObject_t denoms, cudaTextureObject_t outGrads, const uint numElements, const float scale) { const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x; const uint numThreads = B_X * gridDim.x; for (uint i = e; i < numElements; i += numThreads*eltsPerThread) { #pragma unroll for (uint k = 0; k < eltsPerThread; k++) { if (i + k * B_X < numElements) { acts[i + k * B_X] = __fdividef(scale * tex1Dfetch(outGrads, i + k * B_X) * acts[i + k * B_X], tex1Dfetch(denoms, i + k * B_X)); } } } } /* * Block size B_YxB_X * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread * * So each block does one output pixel for some number of images/filters. 
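 *
 * Pipeline sketch (commentary added here): kRNormUndoPrelims above is run
 * first, folding acts := -2 * scale * acts * outGrads / denoms in place.
 * This kernel then sums that quantity over the sizeX x sizeX spatial window
 * around each pixel and finishes with
 *   target = inputs * sum + outGrads * denoms^(-powScale)
 * which has the same shape as the cross-map kFRNormUndo expression, but with
 * the window taken within a single feature map.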
* * threadIdx.x determines img idx * threadIdx.y determines filter idx * * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * numImages must be divisible by B_X*imgsPerThread * numFilters must be divisible by B_Y*filtersPerThread * * TODO: this isn't really ideal */ template __global__ void kRNormUndo(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) { const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int numFilterBlocks = numFilters/(B_Y*filtersPerThread); const int blockPxX = blockIdx.x / numImgBlocks; const int blockPxY = blockIdx.y / numFilterBlocks; const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread; const int blockPx = blockPxY * imgSize + blockPxX; const int imgPixels = imgSize * imgSize; const int startY = MAX(0, blockPxY + sizeX/2 - sizeX + 1); const int startX = MAX(0, blockPxX + sizeX/2 - sizeX + 1); const int endY = MIN(imgSize, blockPxY + sizeX/2 + 1); const int endX = MIN(imgSize, blockPxX + sizeX/2 + 1); const int imgIdx = blockImgIdx + threadIdx.x; acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx; inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; outGrads += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[f][i] = 0; } } for (int sy = startY; sy < endY; sy++) { for (int sx = startX; sx < endX; sx++) { const int outPx = sy * imgSize + sx; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] += acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X]; } } } } } // outGrads += blockPx * numImages; if (scaleTargets == 0) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X]; const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X]; const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X]; const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X]; prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f 
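    // Scalar form of the response-norm backward pass that this kernel
    // evaluates per (filter, pixel, image): with denominator d_j, output y_j,
    // output gradient g_j, addScale a and powScale p, the input gradient is
    //   dE/dx_i = g_i * d_i^(-p) - 2*a*p * x_i * sum_j (y_j * g_j / d_j)
    // where j ranges over the positions whose window contains i. The sum's
    // terms are exactly what kRNormUndoPrelims pre-multiplies into acts.
    // Hedged reference sketch (hypothetical name), kept out of the build:
    #if 0
    float rnormUndoScalarRef(float xi, float gi, float di,
                             float windowSum /* = sum_j y_j * g_j / d_j */,
                             float addScale, float powScale) {
        return gi * powf(di, -powScale) - 2.f * addScale * powScale * xi * windowSum;
    }
    #endif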
* B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i]; } } } } } /* * Block size 16xB_X * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread * * So each block does 4x4 region for some number of images/filters. * * threadIdx.x determines img idx * threadIdx.y determines pixel idx * * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * B_X one of 8, 16, 32 * imgsPerThread one of 1, 2, 4, 8, 16 * * B_XximgsPerThread MUST be divisible by 32. * Number of filters MUST be divisible by filtersPerThread. * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false * numFilters must be divisible by filtersPerThread * * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more * reading than writing here, and the reading is all coalesced, so it should be OK. */ template __global__ void kRNormUndo2(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters, const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) { __shared__ float shActs[filtersPerThread][B_X*imgsPerThread]; const int imgPixels = imgSize * imgSize; const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread); const int numFilterBlocks = numFilters/(filtersPerThread); const int blockPxX = 4*(blockIdx.x / numImgBlocks); const int blockPxY = 4*(blockIdx.y / numFilterBlocks); const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread; const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread; const int tidx = threadIdx.y * B_X + threadIdx.x; const int loadY = tidx / 32, loadX = tidx % 32; const int startPxX = MAX(0, -DIVUP(sizeX,2) + blockPxX + 1); const int startPxY = MAX(0, -DIVUP(sizeX,2) + blockPxY + 1); const int endPxX = MIN(imgSize, blockPxX + sizeX/2 + 4); const int endPxY = MIN(imgSize, blockPxY + sizeX/2 + 4); const int myPxX = blockPxX + threadIdx.y % 4; const int myPxY = blockPxY + threadIdx.y / 4; const int myPxIdx = myPxY * imgSize + myPxX; // const bool doWork = myPxX < imgSize && myPxY < imgSize; const int myStartPxY = -DIVUP(sizeX,2) + myPxY + 1; const int myStartPxX = -DIVUP(sizeX,2) + myPxX + 1; const int myEndPxY = myPxY + sizeX/2 + 1; const int myEndPxX = myPxX + sizeX/2 + 1; const int imgIdx = blockImgIdx + threadIdx.x; acts += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX; denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[f][i] = 0; } } for (int y = startPxY; y < endPxY; y++) { const bool isInY = y >= myStartPxY && y < myEndPxY; for (int x = startPxX; x < endPxX; x++) { const int px = y * imgSize + x; // All the threads load a pixel from memory #pragma unroll for (int ly = 0; ly < filtersPerThread; ly += B_X/2) { if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) { #pragma unroll for 
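    // The load phase above remaps the B_X x 16 thread block through the linear
    // index tidx into 32-wide loader rows (loadY/loadX), so each warp streams
    // consecutive images of one filter and the global reads stay coalesced,
    // independent of the compute-side (pixel, image) thread assignment. A
    // minimal sketch of the same idiom, with illustrative names and sizes,
    // kept out of the build:
    #if 0
    __global__ void cooperativeTileLoad(const float* src, float* dst, int rows, int cols) {
        __shared__ float tile[4][32];
        const int tidx  = threadIdx.y * blockDim.x + threadIdx.x;
        const int loadY = tidx / 32;   // which tile row this thread fills
        const int loadX = tidx % 32;   // consecutive columns => coalesced reads
        if (loadY < rows && loadX < cols) {
            tile[loadY][loadX] = src[loadY * cols + loadX];
        }
        __syncthreads();
        // ... compute phase may index tile[][] with a different thread mapping ...
        if (loadY < rows && loadX < cols) dst[loadY * cols + loadX] = tile[loadY][loadX];
    }
    #endif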
(int lx = 0; lx < B_X*imgsPerThread; lx += 32) { if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) { shActs[ly + loadY][lx + loadX] = acts[(ly * imgPixels + px) * numImages + lx]; } } } } __syncthreads(); // Each row of threads decides if it's interested in this pixel if (isInY && x >= myStartPxX && x < myEndPxX) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[f][i] += shActs[f][threadIdx.x + i * B_X]; } } } } __syncthreads(); } } acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX; acts += threadIdx.x; if (myPxX < imgSize && myPxY < imgSize) { if (!add) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { const float out = outGrads[f * imgPixels * numImages + i * B_X]; const float den = denoms[f * imgPixels * numImages + i * B_X]; const float inp = inputs[f * imgPixels * numImages + i * B_X]; prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); target[f * imgPixels * numImages + i * B_X] = prod[f][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || imgIdx + i * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { const float out = outGrads[f * imgPixels * numImages + i * B_X]; const float den = denoms[f * imgPixels * numImages + i * B_X]; const float inp = inputs[f * imgPixels * numImages + i * B_X]; prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale); target[f * imgPixels * numImages + i * B_X] = scaleTargets * target[f * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i]; } } } } } } void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target, int subsX, int startX, int strideX, int outputsX) { convLocalMaxUndo(images, maxGrads, maxActs, target, subsX, startX, strideX, outputsX, 0, 1); } /* * imgs: (numFilters, imgPixels, numImages) * maxGrads: (numFilters, numOutputs, numImages) * rMaxActs: (numFilters, numOutputs, numImages) * target: (numFilters, imgPixels, numImages) */ void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target, int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput) { int outputs = outputsX * outputsX; int numImages = images.getNumCols(); int numFilters = maxGrads.getNumRows() / outputs; int imgPixels = images.getNumRows() / numFilters; assert(images.getNumRows() == numFilters * imgPixels); int imgSize = int(sqrt(imgPixels)); assert(imgSize * imgSize == imgPixels); assert(maxGrads.getNumRows() == numFilters * outputs); assert(maxGrads.getNumCols() == numImages); assert(!images.isTrans()); assert(!target.isTrans()); assert(!maxGrads.isTrans()); assert(!maxActs.isTrans()); assert(images.isContiguous()); assert(maxGrads.isContiguous()); assert(maxActs.isContiguous()); assert(maxGrads.isSameDims(maxActs)); assert(numFilters % 16 == 0); // assert(numImages % 128 == 0); assert(strideX <= subsX); target.resize(images); assert(target.isContiguous()); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
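    // The host launchers in this file all pick imgsPerThread from the batch
    // size the same way: 4 when numImages divides by 128, 2 when it divides
    // by 64, else 1, and they only instantiate the bounds-checked template
    // path when 32*imgsPerThread does not divide the batch. Pulled out as a
    // standalone helper (hypothetical name, not compiled):
    #if 0
    inline void pickImgsPerThread(int numImages, int& imgsPerThread, bool& checkCaseBounds) {
        imgsPerThread   = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
        checkCaseBounds = numImages % (32 * imgsPerThread) != 0; // bounds checks only when needed
    }
    #endif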
2 : 1; int checkCaseBounds = numImages % (32*imgsPerThread) != 0; dim3 threads(32, 4); dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize); cudaStream_t stream = NVMatrix::getDefaultStream(); if (imgsPerThread == 4) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 4, 2, false, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 4, 2, true, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 4, 2, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 4, 2, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else if (imgsPerThread == 2) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 2, 2, false, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 2, 2, true, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 2, 2, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 2, 2, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 1, 2, false, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 1, 2, true, true><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalMaxUndo<4, 32, 1, 2, false, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalMaxUndo<4, 32, 1, 2, true, false><<>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } getLastCudaError("convLocalMaxUndo: kernel execution failed"); } void convLocalAvgUndo(NVMatrix& avgGrads, 
NVMatrix& target, int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum) { convLocalAvgUndo(avgGrads, target, subsX, startX, strideX, outputsX, imgSize, sum, 0, 1); } /* * avgGrads: (numFilters, numOutputs, numImages) * target: (numFilters, imgPixels, numImages) */ void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target, int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum, float scaleTargets, float scaleOutput) { int numImages = avgGrads.getNumCols(); int outputs = outputsX * outputsX; int imgPixels = imgSize * imgSize; int numFilters = avgGrads.getNumRows() / outputs; assert(avgGrads.getNumRows() == numFilters * outputs); assert(!target.isTrans()); assert(!avgGrads.isTrans()); assert(avgGrads.isContiguous()); assert(numFilters % 16 == 0); // assert(numImages % 128 == 0); assert(strideX <= subsX); target.resize(numFilters * imgPixels, numImages); assert(target.isContiguous()); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; int checkCaseBounds = numImages % (32*imgsPerThread) != 0; dim3 threads(32, 4); dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 4)) * imgSize); cudaStream_t stream = NVMatrix::getDefaultStream(); bool scale = !(scaleTargets == 0 && scaleOutput == 1); if (sum) { if (imgsPerThread == 4) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 4, 4, true, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 4, 4, true, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 4, 4, true, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 4, 4, true, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else if (imgsPerThread == 2) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 2, 4, true, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 2, 4, true, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 2, 4, true, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 2, 4, true, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 1, 4, true, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 1, 4, true, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, 
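    // Every runtime flag of this launcher (sum, scaling, bounds check,
    // images-per-thread) is baked into kLocalAvgUndo as a template parameter,
    // so the dispatch below enumerates all 2*2*2*3 combinations by hand and
    // the kernels carry no runtime branches on these flags. A typical call,
    // assuming NVMatrix operands already shaped as documented (hedged usage
    // sketch, not compiled):
    #if 0
    void exampleAvgUndo(NVMatrix& avgGrads, NVMatrix& target) {
        const int subsX = 3, startX = 0, strideX = 2, outputsX = 16, imgSize = 32;
        // sum=false -> each gradient is divided by the window area, i.e. true average pooling
        convLocalAvgUndo(avgGrads, target, subsX, startX, strideX, outputsX, imgSize, /*sum=*/false);
    }
    #endif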
numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 1, 4, true, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 1, 4, true, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } } else { if (imgsPerThread == 4) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 4, 4, false, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 4, 4, false, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 4, 4, false, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 4, 4, false, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else if (imgsPerThread == 2) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 2, 4, false, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 2, 4, false, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 2, 4, false, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 2, 4, false, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } else { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 1, 4, false, false, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 1, 4, false, true, true> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { kLocalAvgUndo<4, 32, 1, 4, false, false, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } else { kLocalAvgUndo<4, 32, 1, 4, false, true, false> <<>>(avgGrads.getDevData(), target.getDevData(), imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput); } } } } getLastCudaError("convLocalAvgUndo: kernel execution failed"); } void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv) { 
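    // Response normalization is contrast normalization with meanDiffs ==
    // images: per position, denom = minDiv + addScale * sum(x_j^2) over the
    // sizeX*sizeX window, and target = x_i * denom^(-powScale), with denoms
    // kept for the backward pass. Scalar reference of that formula (hedged
    // sketch, hypothetical name, not compiled):
    #if 0
    float responseNormScalarRef(float xi, float sumSq /* window sum of x_j^2 */,
                                float addScale, float powScale, float minDiv, float* denomOut) {
        const float denom = minDiv + addScale * sumSq;
        *denomOut = denom;                 // saved so the undo pass can reuse it
        return xi * powf(denom, -powScale);
    }
    #endif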
convContrastNorm(images, images, denoms, target, numFilters, sizeX, addScale, powScale, minDiv); } /* * images: (numFilters, imgPixels, numImages) * meanDiffs: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) */ void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv) { int numImages = images.getNumCols(); int imgPixels = images.getNumRows() / numFilters; assert(images.getNumRows() == numFilters * imgPixels); int imgSize = int(sqrt(imgPixels)); assert(imgSize * imgSize == imgPixels); assert(meanDiffs.isSameDims(images)); assert(!meanDiffs.isTrans()); assert(!images.isTrans()); assert(images.isContiguous()); assert(meanDiffs.isContiguous()); assert(numFilters % 16 == 0 || numFilters <= 8); target.resize(images); denoms.resize(images); assert(target.isContiguous()); cudaStream_t stream = NVMatrix::getDefaultStream(); if (sizeX >= 6 && numFilters % 4 == 0) { // This one is faster for large regions (my tests show regions >= 6...) int imgsPerThread = 8; int filtersPerThread = 4; int bx = 8; bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0; assert((imgsPerThread * bx) % 32 == 0); assert(numFilters % filtersPerThread == 0); dim3 threads(bx, 16); dim3 blocks(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread); if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here kCNorm2<8, 8, 4, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here kCNorm2<8, 8, 4, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, addScale, powScale, minDiv); } } else { bool checkCaseBounds = numImages % 128 != 0; if (numFilters <= 8) { dim3 threads(128); dim3 blocks(DIVUP(numImages,128) * imgSize, imgSize); if (numFilters == 1) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 1, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 1, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 2) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 2, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 2, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 3) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 3, true><<>>(images.getDevData(), 
meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 3, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 4) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 4, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 4, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 5) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 5, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 5, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 6) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 6, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 6, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 7) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 7, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 7, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } else if (numFilters == 8) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 8, true><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1); kCNorm_fewfilter<1, 8, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numImages, sizeX, addScale, powScale, minDiv); } } } else { dim3 threads(32, 4); dim3 blocks(DIVUP(numImages,threads.x*4), (numFilters / (threads.y * 2)), imgPixels); if (checkCaseBounds) { cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1); kCNorm_manyfilter<4, 32, 4, 2, true><<>>(images.getDevData(), 
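    // For <= 8 filters the channel count is itself a template argument of
    // kCNorm_fewfilter, so the per-filter loop unrolls completely at compile
    // time; the cost is the eight-way if/else chain above. A compact
    // equivalent dispatcher would look like this (hypothetical helper,
    // argument list elided, not compiled):
    #if 0
    template <int F> void launchFewFilter(bool checkCaseBounds /*, ...kernel args... */);
    inline void dispatchFewFilter(int numFilters, bool checkCaseBounds) {
        switch (numFilters) {
            case 1: launchFewFilter<1>(checkCaseBounds); break;
            case 2: launchFewFilter<2>(checkCaseBounds); break;
            case 3: launchFewFilter<3>(checkCaseBounds); break;
            case 4: launchFewFilter<4>(checkCaseBounds); break;
            case 5: launchFewFilter<5>(checkCaseBounds); break;
            case 6: launchFewFilter<6>(checkCaseBounds); break;
            case 7: launchFewFilter<7>(checkCaseBounds); break;
            case 8: launchFewFilter<8>(checkCaseBounds); break;
        }
    }
    #endif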
meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1); kCNorm_manyfilter<4, 32, 4, 2, false><<>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, addScale, powScale, minDiv); } } } getLastCudaError("convResponseNorm: kernel execution failed"); } void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) { convResponseNormUndo(outGrads, denoms, meanDiffs, acts, target, numFilters, sizeX, addScale, powScale, scaleTargets, scaleOutput); } /* * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * THIS WILL OVERWRITE THE ACTS MATRIX. */ void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) { int numImages = outGrads.getNumCols(); int imgPixels = outGrads.getNumRows() / numFilters; int imgSize = int(sqrt(imgPixels)); assert(imgSize * imgSize == imgPixels); assert(outGrads.getNumRows() == numFilters * imgPixels); assert(denoms.isSameDims(outGrads)); assert(acts.isSameDims(denoms)); assert(!denoms.isTrans()); assert(!outGrads.isTrans()); assert(!acts.isTrans()); assert(!target.isTrans()); assert(outGrads.isContiguous()); assert(numFilters % 16 == 0); target.resize(outGrads); assert(target.isContiguous()); // First do acts := -2 x scale x acts x outGrads / denoms // so that the main routine only has to do an addition in its inner loop. int prelimEltsPerThread = 8; dim3 threads(128); dim3 blocks(DIVUP(outGrads.getNumElements(),(threads.x * prelimEltsPerThread))); bool checkPrelimBounds = outGrads.getNumElements() % (threads.x * prelimEltsPerThread) != 0; //printf("num elts: %d, blocks: %d\n", outGrads.getNumElements(), blocks.x); cudaStream_t stream = NVMatrix::getDefaultStream(); kRNormUndoPrelims<128, 8><<>>(acts.getDevData(), denoms.getTextureObject(), outGrads.getTextureObject(), outGrads.getNumElements(), -2*addScale*powScale); // Now the main routine if (sizeX >= 6 && numFilters % 4 == 0) { // This one is faster for large regions (my tests show regions >= 6...) // NOTE: this stuff is not optimized for Kepler. Only kRNormUndo is. int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
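    // The "prelims" pass above rewrites acts in place as
    //   acts := -2 * addScale * powScale * acts * outGrads / denoms
    // so the windowed main pass only has to sum its inputs. Stripped of the
    // texture reads and per-thread element batching, the elementwise kernel
    // reduces to (hedged sketch, hypothetical name, not compiled):
    #if 0
    __global__ void rnormUndoPrelimsRef(float* acts, const float* denoms, const float* outGrads,
                                        int n, float scale /* = -2*addScale*powScale */) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            acts[i] = scale * outGrads[i] * acts[i] / denoms[i];
        }
    }
    #endif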
4 : 2; int filtersPerThread = 4; int bx = 16; bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0; assert((imgsPerThread * bx) % 32 == 0); threads = dim3(bx, 16); blocks = dim3(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread); if (imgsPerThread == 8) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1); kRNormUndo2<16, 8, 4, true, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1); kRNormUndo2<16, 8, 4, false, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 8, 4, true, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 8, 4, false, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } } else if (imgsPerThread == 4) { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1); kRNormUndo2<16, 4, 4, true, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1); kRNormUndo2<16, 4, 4, false, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 4, 4, true, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 4, 4, false, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } } else { if (checkCaseBounds) { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, true>, cudaFuncCachePreferL1); kRNormUndo2<16, 2, 4, true, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1); 
kRNormUndo2<16, 2, 4, false, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } else { if (scaleTargets == 0 && scaleOutput == 1) { cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 2, 4, true, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1); kRNormUndo2<16, 2, 4, false, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } } } else { int imgsPerThread = numImages % 128 == 0 ? 4 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; threads = dim3(32, 4); blocks = dim3(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize); if (imgsPerThread == 4) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRNormUndo<4, 32, 4, 2, true>, cudaFuncCachePreferL1); kRNormUndo<4, 32, 4, 2, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo<4, 32, 4, 2, false>, cudaFuncCachePreferL1); kRNormUndo<4, 32, 4, 2, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, true>, cudaFuncCachePreferL1); kRNormUndo<4, 32, 1, 2, true><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, false>, cudaFuncCachePreferL1); kRNormUndo<4, 32, 1, 2, false><<>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale, scaleTargets, scaleOutput); } } } getLastCudaError("kRNormUndo: kernel execution failed"); } /* * imgs: (numChannels, imgPixels, numImages) with given imgStride * target: (numChannels, tgtPixels, numImages) * * imgSize = scale * tgtSize */ void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale) { assert(!images.isTrans()); assert(!target.isTrans()); int imgPixels = imgSize * imgSize; int tgtPixels = tgtSize * tgtSize; int numChannels = images.getNumRows() / imgPixels; int numImages = images.getNumCols(); assert(images.getNumRows() == numChannels * imgPixels); target.resize(numChannels * tgtPixels, numImages); assert(target.isContiguous()); int numChunksX = DIVUP(tgtSize, 4); int numChunks = numChunksX * numChunksX; double imgCenter = imgSize * 0.5; double tgtCenter = tgtSize * 0.5; double centerScale = imgCenter - tgtCenter * scale; int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
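    // centerScale below aligns the centers of the source and target grids: a
    // target pixel t samples source coordinate t*scale + centerScale, so the
    // target center lands exactly on the image center. One-dimensional
    // reference of the sampling position (hedged sketch, not compiled):
    #if 0
    float srcCoord(int tgtPixel, float scale, float imgSize, float tgtSize) {
        const float centerScale = imgSize * 0.5f - tgtSize * 0.5f * scale;
        return tgtPixel * scale + centerScale; // bilinear blend of the neighboring source pixels follows
    }
    #endif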
2 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; cudaStream_t stream = NVMatrix::getDefaultStream(); dim3 threads(32, 16); dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks); if (imgsPerThread == 4) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1); kResizeBilinear<4, true><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } else { cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1); kResizeBilinear<4, false><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } } else if (imgsPerThread == 2) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1); kResizeBilinear<2, true><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } else { cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1); kResizeBilinear<2, false><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1); kResizeBilinear<1, true><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } else { cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1); kResizeBilinear<1, false><<>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale); } } getLastCudaError("convResizeBilinear: kernel execution failed"); } /* * imgs: (3, imgPixels, numImages) with given imgStride * target: (3, imgPixels, numImages) */ void convRGBToYUV(NVMatrix& images, NVMatrix& target) { assert(!images.isTrans()); assert(!target.isTrans()); int imgPixels = images.getNumRows() / 3; int numImages = images.getNumCols(); assert(images.getNumRows() == 3 * imgPixels); target.resize(3 * imgPixels, numImages); assert(target.isContiguous()); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
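    // kRGBToYUV applies a fixed 3x3 color transform per pixel across the
    // three channel planes; the exact coefficients live in the kernel,
    // defined earlier in this file. For illustration only, the standard
    // BT.601 matrix would read as follows; this is an assumption, not a
    // quote of the kernel's constants (not compiled):
    #if 0
    void rgbToYuvRef(float r, float g, float b, float* y, float* u, float* v) {
        *y =  0.299f * r + 0.587f * g + 0.114f * b;
        *u = -0.147f * r - 0.289f * g + 0.436f * b;
        *v =  0.615f * r - 0.515f * g - 0.100f * b;
    }
    #endif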
2 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; cudaStream_t stream = NVMatrix::getDefaultStream(); dim3 threads(32, 4); dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); if (imgsPerThread == 4) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1); kRGBToYUV<4, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1); kRGBToYUV<4, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } else if (imgsPerThread == 2) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1); kRGBToYUV<2, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1); kRGBToYUV<2, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1); kRGBToYUV<1, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1); kRGBToYUV<1, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } getLastCudaError("convRGBToYUV: kernel execution failed"); } /* * imgs: (3, imgPixels, numImages) with given imgStride * target: (3, imgPixels, numImages) */ void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) { assert(!images.isTrans()); assert(!target.isTrans()); int imgPixels = images.getNumRows() / 3; int numImages = images.getNumCols(); assert(images.getNumRows() == 3 * imgPixels); target.resize(3 * imgPixels, numImages); assert(target.isContiguous()); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 
2 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; dim3 threads(32, 4); dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4)); cudaStream_t stream = NVMatrix::getDefaultStream(); if (imgsPerThread == 4) { if (center) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1); kRGBToLAB<4, true, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<4, false, true>, cudaFuncCachePreferL1); kRGBToLAB<4, false, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<4, true, false>, cudaFuncCachePreferL1); kRGBToLAB<4, true, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<4, false, false>, cudaFuncCachePreferL1); kRGBToLAB<4, false, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } } else if (imgsPerThread == 2) { if (center) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1); kRGBToLAB<2, true, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<2, false, true>, cudaFuncCachePreferL1); kRGBToLAB<2, false, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<2, true, false>, cudaFuncCachePreferL1); kRGBToLAB<2, true, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<2, false, false>, cudaFuncCachePreferL1); kRGBToLAB<2, false, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } } else { if (center) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1); kRGBToLAB<1, true, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<1, false, true>, cudaFuncCachePreferL1); kRGBToLAB<1, false, true><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kRGBToLAB<1, true, false>, cudaFuncCachePreferL1); kRGBToLAB<1, true, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } else { cudaFuncSetCacheConfig(kRGBToLAB<1, false, false>, cudaFuncCachePreferL1); kRGBToLAB<1, false, false><<>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride()); } } } getLastCudaError("convRGBToLAB: kernel execution failed"); } /* * imgs: (numChannels, imgPixels, numImages) with given imgStride * target: (numChannels, tgtPixels, numImages) */ void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX) { int numImages = imgs.getNumCols(); int imgPixels = imgSize * imgSize; int tgtPixels = tgtSize * tgtSize; int numChannels = imgs.getNumRows() / imgPixels; assert(imgs.getNumRows() == imgPixels * numChannels); assert(imgPixels == imgSize * imgSize); assert(imgSize - startY >= tgtSize); assert(imgSize - startX >= tgtSize); assert(startY >= 0); assert(startX >= 0); target.resize(numChannels * tgtPixels, numImages); int imgsPerThread = numImages % 
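    // The crop is a pure gather: for channel c, target pixel (ty, tx) reads
    // source pixel (startY + ty, startX + tx); the asserts above guarantee
    // the window fits inside the image. Single-channel CPU reference (hedged
    // sketch, hypothetical name, not compiled):
    #if 0
    void cropRef(const float* img, float* tgt, int imgSize, int tgtSize, int startY, int startX) {
        for (int ty = 0; ty < tgtSize; ty++) {
            for (int tx = 0; tx < tgtSize; tx++) {
                tgt[ty * tgtSize + tx] = img[(startY + ty) * imgSize + (startX + tx)];
            }
        }
    }
    #endif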
128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; bool checkCaseBounds = numImages % (32*imgsPerThread) != 0; dim3 blocks(DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4)); dim3 threads(32, 4); cudaStream_t stream = NVMatrix::getDefaultStream(); if (imgsPerThread == 4) { if (checkCaseBounds) { kCrop<4, true><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } else { kCrop<4, false><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } } else if (imgsPerThread == 2) { if (checkCaseBounds) { kCrop<2, true><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } else { kCrop<2, false><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } } else { if (checkCaseBounds) { kCrop<1, true><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } else { kCrop<1, false><<>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX); } } getLastCudaError("convCrop: kernel execution failed"); } /* * images: (numFilters, imgPixels, numImages) * meanDiffs: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) * Note: at present, I have no code to compute the meanDiffs. So it should be set * to be equal to images. In other words, this isn't really doing contrast normalization, * just response normalization. */ void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked) { int numImages = images.getNumCols(); int imgPixels = images.getNumRows() / numFilters; assert(images.getNumRows() == numFilters * imgPixels); int imgSize = int(sqrt(imgPixels)); assert(imgSize * imgSize == imgPixels); assert(meanDiffs.isSameDims(images)); assert(sizeF > 0 && sizeF <= numFilters); assert(!meanDiffs.isTrans()); assert(!images.isTrans()); assert(images.isContiguous()); assert(meanDiffs.isContiguous()); assert(numFilters % 16 == 0); target.resize(images); // denoms.resize(images); assert(target.isContiguous()); bool checkCaseBounds = numImages % 128 != 0; dim3 threads(32, 4); dim3 blocks(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize); cudaStream_t stream = NVMatrix::getDefaultStream(); // printf("convContrastNormCrossMap imgs: %p, meanDiffs: %p, denoms: %p, target: %p, imgSize: %d, numFilters: %d, numImages: %d, sizeF: %d, addScale: %f, powScale: %f, minDiv: %f, blocked: %d\n", // images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, blocked); if (blocked) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1); kFCNorm<4, 32, 4, true, true><<>>(images.getTextureObject(), meanDiffs.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1); kFCNorm<4, 32, 4, false, true><<>>(images.getTextureObject(), meanDiffs.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, false>, 
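    // Cross-map normalization sums squares over a window of sizeF *filters*
    // at each pixel. With blocked=true the filters are partitioned into
    // fixed blocks of sizeF; with blocked=false a window slides across the
    // filter axis. One common centering convention for the window start is
    // sketched below; the exact off-center convention lives in kFCNorm, so
    // treat this as an assumption (not compiled):
    #if 0
    int crossMapWindowStart(int f, int sizeF, bool blocked) {
        return blocked ? (f / sizeF) * sizeF   // fixed partition containing filter f
                       : f - sizeF / 2;        // sliding window, clamped by the caller
    }
    #endif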
cudaFuncCachePreferL1); kFCNorm<4, 32, 4, true, false><<>>(images.getTextureObject(), meanDiffs.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv); } else { cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1); kFCNorm<4, 32, 4, false, false><<>>(images.getTextureObject(), meanDiffs.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv); } } getLastCudaError("convContrastNormCrossMap: kernel execution failed"); } /* * outGrads: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) * inputs: (numFilters, imgPixels, numImages) * acts: (numFilters, imgPixels, numImages) * target: (numFilters, imgPixels, numImages) * * THIS WILL OVERWRITE THE ACTS MATRIX. */ void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked, float scaleTargets, float scaleOutput) { int numImages = outGrads.getNumCols(); int imgPixels = outGrads.getNumRows() / numFilters; int imgSize = int(sqrt(imgPixels)); assert(imgSize * imgSize == imgPixels); assert(sizeF > 0 && sizeF <= numFilters); assert(outGrads.getNumRows() == numFilters * imgPixels); assert(!outGrads.isTrans()); assert(!acts.isTrans()); assert(!target.isTrans()); assert(outGrads.isContiguous()); assert(numFilters % 16 == 0); target.resize(outGrads); assert(target.isContiguous()); // First do acts := -2 x scale x acts x outGrads / denoms // so that the main routine only has to do an addition in its inner loop. cudaStream_t stream = NVMatrix::getDefaultStream(); dim3 threads2 = dim3(32, 4); dim3 blocks2 = dim3(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize); bool checkCaseBounds = (numImages % 128) != 0; if (blocked) { if (scaleTargets == 0 && scaleOutput == 1) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, false, true, true>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, false, true, true><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, false, false, true>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, false, false, true><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, true, true, true>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, true, true, true><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, true, false, true>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, true, false, true><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } } } else { if (scaleTargets == 0 && scaleOutput == 1) { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, false, true, false>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, false, true, 
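    // These kernels read their inputs through cudaTextureObject_t handles
    // (getTextureObject()) to route loads through the texture cache. Binding
    // a plain linear float buffer to a texture object follows the same recipe
    // as GetTensorTextureObject() at the end of this file (hedged sketch,
    // hypothetical name, not compiled):
    #if 0
    cudaTextureObject_t makeLinearFloatTexture(float* devPtr, size_t bytes) {
        cudaResourceDesc res_desc;
        std::memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeLinear;
        res_desc.res.linear.devPtr = devPtr;
        res_desc.res.linear.sizeInBytes = bytes;
        res_desc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
        cudaTextureDesc tex_desc;
        std::memset(&tex_desc, 0, sizeof(tex_desc));
        cudaTextureObject_t tex_obj;
        cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr);
        return tex_obj; // reads then use tex1Dfetch<float>(tex_obj, index)
    }
    #endif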
false><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, false, false, false>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, false, false, false><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } } else { if (checkCaseBounds) { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, true, true, false>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, true, true, false><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } else { cudaFuncSetCacheConfig(kFRNormUndo2<4, 32, 4, true, false, false>, cudaFuncCachePreferL1); kFRNormUndo2<4, 32, 4, true, false, false><<>>(outGrads.getTextureObject(), inputs.getTextureObject(), acts.getTextureObject(), target.getDevData(), imgSize, numFilters, numImages, sizeF, addScale, powScale, minDiv, scaleTargets, scaleOutput); } } } getLastCudaError("convResponseNormCrossMapUndo: kernel execution failed"); } void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked) { convContrastNormCrossMap(images, images, target, numFilters, sizeF, addScale, powScale, minDiv, blocked); } /* * images: (numFilters, imgPixels, numImages) * denoms: (numFilters, imgPixels, numImages) (out) * target: (numFilters, imgPixels, numImages) (out) */ void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, bool blocked) { convContrastNormCrossMap(images, images, target, numFilters, sizeF, addScale, powScale, 1, blocked); } cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor) { cudaTextureObject_t tex_obj; cudaResourceDesc res_desc; std::memset(&res_desc, 0, sizeof(res_desc)); res_desc.resType = cudaResourceTypeLinear; res_desc.res.linear.devPtr = tensor->mutable_data(); res_desc.res.linear.sizeInBytes = tensor->nbytes(); res_desc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaTextureDesc tex_desc; std::memset(&tex_desc, 0, sizeof(tex_desc)); CUDA_ENFORCE( cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr)); return tex_obj; } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */

#include
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../include/cudaconv2.cuh"

__device__ __forceinline__ void filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
        int fPidx, int imgLoadModPosY, int imgLoadModPosX, int imgSizeX, int filterSize, int& iPidx) {
    int x = imgLoadModPosX + (fPidx) % filterSize;
    int y = imgLoadModPosY + (fPidx) / filterSize;
    iPidx = y >= 0 && y < imgSizeX && x >= 0 && x < imgSizeX ? y * imgSizeX + x : -1;
}

#define FA_COLOR3_IMPRELOAD(c,i) imPreload[c][i] = iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) ? 0 : mm[c * imgPixels * imgStride + i * B_X];
#define FA_COLOR3_IMPRELOAD_TX(c,i) imPreload[c][i] = iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) ? 0 : tex1Dfetch<float>(images, imagesOffset2 + c * imgPixels * imgStride + i * B_X);

/*
 * images:      (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
 * filters:     (numFilterColors, filterPixels, numFilters)             if conv
 *              (numModules, numFilterColors, filterPixels, numFilters) otherwise
 *
 * targets:     (numFilters, numModulesY, numModulesX, numImages)
 *
 */
template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int numColors, int pixelCache, bool scale, bool checkImgBounds>
//__launch_bounds__(128,3)
__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets,
        const int numImages, const int numFilters,
        const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
        const int moduleStride,
        const int numModulesY, const int numModulesX, const int imgStride,
        const float scaleTargets, const float scaleOutputs,
        const bool conv/*, const bool noloads*/) {
    __shared__ float shFilters[numColors][pixelCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters
    __shared__ float shImages[numColors][pixelCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images
    const int imgPixels = imgSizeY * imgSizeX;
    const int filterPixels = filterSize * filterSize;
    const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
    const int moduleIdx = blockIdx.y / blocksPerModule;
    const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
    const int numModules = numModulesX * numModulesY;
    // Another fun insanity: the % B_X makes things faster, even though threadIdx.x is
    // in the range 0..31. It appears that this allows the compiler to optimize?
    const int tx = threadIdx.x % B_X;
    const int ty = threadIdx.y % B_Y;
    const int tidx = ty * B_X + threadIdx.x;
    const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
    const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
    const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
    const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
    const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
    // images += myImgIdx;
    // filters += blockFilterIdx
    //         + shFilterLoadY * numFilters + shFilterLoadX;
    // if (!conv) { // NOTE: UNTESTED!
    //     filters += moduleIdx * numColors * filterPixels * numFilters;
    // }
    const int imagesOffset = myImgIdx;
    const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + shFilterLoadX + (conv ?
0 : moduleIdx * numColors * filterPixels * numFilters); targets += moduleIdx * numImages + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; float prod[imgsPerThread][filtersPerThread]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] = 0; } } int iPidxNext; float imPreload[numColors][imgsPerThread]; float fPreload[numColors][pixelCache*filtersPerThread/B_X]; #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int p = 0; p < pixelCache; p += B_X/filtersPerThread) { if (p + shFilterLoadY < filterPixels) { fPreload[c][p*filtersPerThread/B_X] = tex1Dfetch(filters, filtersOffset + p * numFilters + c * numFilters * filterPixels); } else{ fPreload[c][p*filtersPerThread/B_X] = 0; } } } filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (iPidxNext >= 0 && (!checkImgBounds || myImgIdx + i * B_X < numImages)) { imPreload[c][i] = tex1Dfetch(images, imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); } else { imPreload[c][i] = 0; } } } for (int p = 0; p < filterPixels; p += pixelCache) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int c = 0; c < numColors; ++c) { // NOTE: bank conflicts here! shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; } } const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(fPidxNext + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); // const float* ff = &filters[numFilters * fPidxNext]; // const float* mm = &images[imgStride * iPidxNext]; const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; FA_COLOR3_IMPRELOAD_TX(0,0); FA_COLOR3_IMPRELOAD_TX(0,1); FA_COLOR3_IMPRELOAD_TX(0,2); FA_COLOR3_IMPRELOAD_TX(0,3); #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int pp = 0; pp < pixelCache; pp += B_X/filtersPerThread) { shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp*filtersPerThread/B_X]; } } __syncthreads(); FA_COLOR3_IMPRELOAD_TX(1,0); FA_COLOR3_IMPRELOAD_TX(1,1); FA_COLOR3_IMPRELOAD_TX(1,2); FA_COLOR3_IMPRELOAD_TX(1,3); FA_COLOR3_IMPRELOAD_TX(2,0); FA_COLOR3_IMPRELOAD_TX(2,1); FA_COLOR3_IMPRELOAD_TX(2,2); FA_COLOR3_IMPRELOAD_TX(2,3); #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int pp = 0; pp < pixelCache*filtersPerThread/B_X; pp++) { fPreload[c][pp] = fPidxNext + pp*(B_X/filtersPerThread) + shFilterLoadY >= filterPixels ? 
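The loop above is software-pipelined: while shared memory feeds the math for pixel p, the loads for pixel p + pixelCache are already in flight into the imPreload/fPreload registers, and only then committed to shared memory. A deliberately trivial sketch of that double-buffering pattern, independent of this kernel's indexing (the shared-memory staging is superfluous for a per-thread sum; it stands in for the shImages/shFilters tiles):

#include <cuda_runtime.h>

// Launch as pipelinedSum<256><<<1, 256>>>(d_in, d_out, n), with n a multiple of 256.
// Each thread sums in[tid], in[tid + TILE], ... while the next chunk is prefetched.
template <int TILE>
__global__ void pipelinedSum(const float* in, float* out, int n) {
    __shared__ float sh[TILE];
    float preload = in[threadIdx.x];      // prime the pipeline
    float acc = 0.f;
    for (int base = 0; base < n; base += TILE) {
        sh[threadIdx.x] = preload;        // commit the staged chunk
        int next = base + TILE;
        // Issue the next load immediately; it overlaps the barrier and the
        // arithmetic below, which is the point of imPreload/fPreload above.
        preload = next < n ? in[next + threadIdx.x] : 0.f;
        __syncthreads();
        acc += sh[threadIdx.x];
        __syncthreads();                  // sh is rewritten next iteration
    }
    out[threadIdx.x] = acc;
}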
0 : tex1Dfetch(filters, filtersOffset2 + c * numFilters* filterPixels + pp*(B_X/filtersPerThread) * numFilters); } } #pragma unroll for (int pp = 0; pp < pixelCache; pp++) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { #pragma unroll for(int i = 0; i < imgsPerThread; i++) { prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * shFilters[c][pp][ty * filtersPerThread + f]; } } } } __syncthreads(); } if (scale) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; } } } } else { // Note: reversing order of these loops saves 2 registers, but costs time #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; } } } } } /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModules, numFilterColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModulesY, numModulesX, numImages) * * This won't be pretty. */ template __global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const float scaleTargets, const float scaleOutputs, const bool conv/*, const bool noloads*/) { __shared__ float shFilters[numColors][pixelCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters __shared__ float shImages[numColors][pixelCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); const int numModules = numModulesX * numModulesY; // Another fun insanity: the % B_X makes things faster, even though threadIdx.x is // in the range 0..31. It appears that this allows the compiler to optimize? const int tx = threadIdx.x % B_X; const int ty = threadIdx.y % B_Y; const int tidx = ty * B_X + threadIdx.x; const int warp = tidx / 32; const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; const int shFilterLoadY = tidx / (B_Y * filtersPerThread); const int shFilterLoadX = tidx % (B_Y * filtersPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; // images += myImgIdx; // filters += blockFilterIdx // + shFilterLoadY * numFilters + shFilterLoadX; // if (!conv) { // NOTE: UNTESTED! // filters += moduleIdx * numColors * filterPixels * numFilters; // } const int imagesOffset = myImgIdx; const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + shFilterLoadX + (conv ? 
0 : moduleIdx * numColors * filterPixels * numFilters); targets += moduleIdx * numImages + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; float prod[imgsPerThread][filtersPerThread]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] = 0; } } int iPidxNext; float imPreload[numColors][imgsPerThread]; float fPreload[numColors][DIVUP(pixelCache*filtersPerThread,B_X)]; if (warp < 3) { #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int p = 0; p < pixelCache; p += 2) { if (p + shFilterLoadY < filterPixels) { fPreload[c][p/2] = tex1Dfetch(filters, filtersOffset + p * numFilters + c * numFilters * filterPixels); } else { fPreload[c][p/2] = 0; } } } } filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (iPidxNext >= 0 && (!checkImgBounds || myImgIdx + i * B_X < numImages)) { imPreload[c][i] = tex1Dfetch(images, imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); } else { imPreload[c][i] = 0; } } } for (int p = 0; p < filterPixels; p += pixelCache) { const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(fPidxNext + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { // NOTE: bank conflicts here! shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; } } if (warp < 3) { #pragma unroll for (int c = 0; c < numColors; ++c) { #pragma unroll for (int pp = 0; pp < pixelCache; pp += 2) { shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp/2]; } } } __syncthreads(); // const float* ff = &filters[numFilters * fPidxNext]; // const float* mm = &images[imgStride * iPidxNext]; const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; #pragma unroll for (int i = 0; i < imgsPerThread; ++i) { #pragma unroll for (int c = 0; c < numColors; c++) { FA_COLOR3_IMPRELOAD_TX(c,i); } } #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int pp = 0; pp < 2; pp++) { fPreload[c][pp] = warp >= 3 || fPidxNext + pp*2 + shFilterLoadY >= filterPixels ? 
0 : tex1Dfetch(filters, filtersOffset2 + c * numFilters* filterPixels + pp*2 * numFilters); } #pragma unroll for (int pp = 0; pp < pixelCache; pp++) { #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * shFilters[c][pp][ty * filtersPerThread + f]; } } } } __syncthreads(); } if (scale) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; } } } } else { // Note: reversing order of these loops costs 2 registers, but saves time #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; } } } } } __device__ inline void filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(int filterSize, int imgSizeX, int imgLoadModPosY, int imgLoadModPosX, int imgY, int imgX, int& fPidx, int& iPidx) { int filterPxY = imgY - imgLoadModPosY; int filterPxX = imgX - imgLoadModPosX; fPidx = filterPxY * filterSize + filterPxX; iPidx = imgY * imgSizeX + imgX; // Pixel index in img } /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModules, numFilterColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModulesY, numModulesX, numImages) * * Note: in git there's a 1.5% faster version of this which sues 167 registers instead of 154... * it's basically the same thing, but it doesn't do the next-pixel computation. It just avoids * pre-loading when it rolls over to the next pixel. */ template __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* images, float* filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs, const bool conv/*, const bool noloads*/) { __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int numFilterColors = numImgColors / numGroups; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numModules = numModulesX * numModulesY; const int blockColorIdx = numFilterColors * blockGroupIdx; // Another fun insanity: the % B_X makes things faster, even thought threadIdx.x is // in the range 0..31. It appears that this allows the compiler to optimize? 
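The setPixelCoords helper defined above converts an absolute image coordinate into a row-major filter-pixel index and a row-major image-pixel index. A small worked instance, with all numbers illustrative:

// filterSize = 5, imgSizeX = 32, module at imgLoadModPosY/X = (-2, -2) (padding 2),
// visiting (imgY, imgX) = (0, 1):
//   filterPxY = 0 - (-2) = 2,  filterPxX = 1 - (-2) = 3
//   fPidx = 2 * 5 + 3  = 13    // index into the 5x5 filter
//   iPidx = 0 * 32 + 1 = 1     // index into the 32x32 image plane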
const int tx = threadIdx.x % B_X; const int ty = threadIdx.y % B_Y; const int tidx = ty * B_X + threadIdx.x; const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; const int shFilterLoadY = tidx / (B_Y * filtersPerThread); const int shFilterLoadX = tidx % (B_Y * filtersPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; filters +=blockFilterIdx + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; if (!conv) { filters += moduleIdx * numFilterColors * filterPixels * numFilters; } targets += moduleIdx * numImages + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; float prod[imgsPerThread][filtersPerThread]; // float fCache[filtersPerThread]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] = 0; } } // NOTE: these max/min functions increase register usage as compared to my macros const int imgStartX = max(0, imgLoadModPosX); const int imgStartY = max(0, imgLoadModPosY); const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); // __shared__ int imgPos[] int fPidx, iPidx; float imPreload[imgsPerThread]; float fPreload[colorCache*filtersPerThread/B_X]; // float fCache[filtersPerThread]; filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { imPreload[i] = images[imgStride * iPidx + i * B_X]; } else { imPreload[i] = 0; } } if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { // This if statement reduces reg usage.. #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { fPreload[c*filtersPerThread/B_X] = filters[(c * filterPixels + fPidx) * numFilters]; } } for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { // const int filterPxY = imgY - imgLoadModPosY; for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { // const int filterPxX = imgX - imgLoadModPosX; // const int p = filterPxY * filterSize + filterPxX; // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img // setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgY, imgX, &p, &pixIdx); // float* m = &images[imgStride * pixIdx]; const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; int imgYNext = imgY; int imgXNext = imgX; int fPidxNext, iPidxNext; if (!lastPixel) { imgYNext = imgY + (imgX + 1 == imgEndX); imgXNext = imgX + 1 == imgEndX ? 
imgStartX : imgX + 1; } filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) const float* ff = &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; const float* mm = &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; if (oc == numFilterColors - colorCache) { ff = &filters[fPidxNext * numFilters]; mm = &images[iPidxNext * imgStride]; fPidx = fPidxNext; iPidx = iPidxNext; } #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { shFilters[c + shFilterLoadY][shFilterLoadX] = fPreload[c*filtersPerThread/B_X]; } #pragma unroll for (int i = 0; i < imgsPerThread; i++) { // NOTE: bank conflicts here! shImages[ty][tx * imgsPerThread + i] = imPreload[i]; } imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) ? 0 : mm[0 * B_X]; imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) ? 0 : mm[1 * B_X]; imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) ? 0 : mm[2 * B_X]; __syncthreads(); #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * shFilters[0][threadIdx.y * filtersPerThread + f]; } } fPreload[0] = ff[0]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * shFilters[1][threadIdx.y * filtersPerThread + f]; } } fPreload[1] = ff[(B_X/filtersPerThread * filterPixels) * numFilters]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * shFilters[2][threadIdx.y * filtersPerThread + f]; } } imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) ? 0 : mm[3 * B_X]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * shFilters[3][threadIdx.y * filtersPerThread + f]; } } __syncthreads(); } } } if (scale) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; } } } } else { // Note: reversing order of these loops saves 2 registers, but costs time #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; } } } } } /*****************************Function Revision Record***************************** * Author: Tencent BestImage Team(ankerguo@tencent.com) * * Date: 2015-05-18 * * Reason: Optimizing kernel to get faster speed according to GPU features * * Method: * * 1. reorganizing data structure to avoid bank conflict; * * 2. using vectorized data type; * * 3. improving instruction-level parallelism; * * 4. removing redundant 'if' branches; * * 5. removing local variables to save registers. 
* *********************************************************************************/ /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModules, numFilterColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModulesY, numModulesX, numImages) * */ template __global__ void __launch_bounds__(128, 4) filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs, const bool conv/*, const bool noloads*/) { // avoid bank conflict by reorganizing the data structure and improve the band width by using 'float2' instead of 'float' __shared__ float2 shFilters[colorCache / 2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters __shared__ float2 shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int numFilterColors = numImgColors / numGroups; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numModules = numModulesX * numModulesY; const int blockColorIdx = numFilterColors * blockGroupIdx; // Another fun insanity: the % B_X makes things faster, even thought threadIdx.x is // in the range 0..31. It appears that this allows the compiler to optimize? const int tx = threadIdx.x % B_X; const int ty = threadIdx.y % B_Y; //const int tidx = ty * B_X + threadIdx.x; // reduce one register const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; // reduce two registers //const int shFilterLoadY = tidx / (B_Y * filtersPerThread); //const int shFilterLoadX = tidx % (B_Y * filtersPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx; const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx; // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; const int filterOffset = blockFilterIdx + ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels + ((ty * B_X + tx) % (B_Y * filtersPerThread)) + (conv ? 
0 : moduleIdx * numFilterColors * filterPixels * numFilters); // filters +=blockFilterIdx // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; // if (!conv) { // filters += moduleIdx * numFilterColors * filterPixels * numFilters; // } targets += moduleIdx * numImages + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; // combine two registers into one const int numModImages = numModules * numImages; float prod[imgsPerThread][filtersPerThread]; // float fCache[filtersPerThread]; #pragma unroll for(int i = 0; i < imgsPerThread; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[i][f] = 0; } } // NOTE: these max/min functions increase register usage as compared to my macros const int imgStartX = max(0, imgLoadModPosX); const int imgStartY = max(0, imgLoadModPosY); const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); // __shared__ int imgPos[] int fPidx, iPidx; float imPreload[imgsPerThread]; // [4] float fPreload[colorCache*filtersPerThread/B_X]; // [2] // float fCache[filtersPerThread]; filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); // remove redundant conditions #pragma unroll for (int i = 0; i < imgsPerThread; i++) { imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); } #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); } for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { // const int filterPxY = imgY - imgLoadModPosY; for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { // const int filterPxX = imgX - imgLoadModPosX; // const int p = filterPxY * filterSize + filterPxX; // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img // setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgY, imgX, &p, &pixIdx); // float* m = &images[imgStride * pixIdx]; const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; int imgYNext = imgY; int imgXNext = imgX; int fPidxNext, iPidxNext; if (!lastPixel) { imgYNext = imgY + (imgX + 1 == imgEndX); imgXNext = imgX + 1 == imgEndX ? 
imgStartX : imgX + 1; } filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) // store the preloaded pixel of filter and image into shared memory shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].x = fPreload[0]; shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].y = fPreload[1]; shImages[ty][tx].x = imPreload[0]; shImages[ty][tx].y = imPreload[1]; shImages[ty][tx+B_X].x = imPreload[2]; shImages[ty][tx+B_X].y = imPreload[3]; int imgOffset2 = imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); int filterOffset2 = filterOffset + numFilters * ((oc + colorCache) * filterPixels + fPidx); if (oc == numFilterColors - colorCache) { filterOffset2 = filterOffset + fPidxNext * numFilters; imgOffset2 = imgOffset + iPidxNext * imgStride; fPidx = fPidxNext; iPidx = iPidxNext; } // preload one pixel of filter and image from texture, and no need to check 'checkImgBounds' with all callers setting it as false imPreload[0] = tex1Dfetch(images, imgOffset2); imPreload[1] = tex1Dfetch(images, imgOffset2 + B_X); imPreload[2] = tex1Dfetch(images, imgOffset2 + 2 * B_X); imPreload[3] = tex1Dfetch(images, imgOffset2 + 3 * B_X); fPreload[0] = tex1Dfetch(filters, filterOffset2); fPreload[1] = tex1Dfetch(filters, filterOffset2 + 2 * filterPixels * numFilters); __syncthreads(); // put together the instructions with same type to improve instruction-level parallelism // calculate the convolution between images and filters #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int r = 0; r < colorCache / 2; r++) { prod[0][f] += shImages[r][tx].x * shFilters[r][ty*filtersPerThread+f].x; prod[1][f] += shImages[r][tx].y * shFilters[r][ty*filtersPerThread+f].x; prod[2][f] += shImages[r][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].x; prod[3][f] += shImages[r][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].x; prod[0][f] += shImages[r+2][tx].x * shFilters[r][ty*filtersPerThread+f].y; prod[1][f] += shImages[r+2][tx].y * shFilters[r][ty*filtersPerThread+f].y; prod[2][f] += shImages[r+2][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].y; prod[3][f] += shImages[r+2][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].y; } } __syncthreads(); } } } if (scale) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { // remove the redundant condition for less registers targets[i * B_X + f * numModImages] = scaleTargets * targets[i * B_X + f * numModImages] + scaleOutputs * prod[i][f]; } } } else { // Note: reversing order of these loops saves 2 registers, but costs time #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { // remove the redundant condition for less registers targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f]; } } } } /* * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. 
* threadIdx.x determines image * threadIdx.y determines filter * * blockIdx.x determines image batch of B_X * imgsPerThread * blockIdx.y determines filter batch of module and B_Y * filtersPerThread * * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numColors, filterPixels, numFilters) if conv * (numModules, numColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModulesY, numModulesX, numImages) * * * Number of filters per module should be divisible by B_Y * filtersPerThread * checkImgBounds indicates whether number of images is divisible by B_X * imgsPerThread * * The imgSize here is the size of the actual image without the padding. * */ template __global__ void filterActs_YxX_color(float* images, float* filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const float scaleTargets, const float scaleOutputs, const bool conv) { __shared__ float shFilters[pixelCache*numColors][B_Y * filtersPerThread]; // pre-load pixelCache pixels from B_Y*filtersPerThread filters __shared__ float shImages[pixelCache*numColors][B_X * imgsPerThread]; // pre-load pixelCache pixels from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = blockIdx.y % blocksPerModule; const int tidx = threadIdx.y * B_X + threadIdx.x; const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; const int numModules = numModulesY * numModulesX; const int shFilterLoadY = tidx / (B_Y * filtersPerThread); const int shFilterLoadX = tidx % (B_Y * filtersPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; images += myImgIdx; filters += filtersPerThread * B_Y * blockFilterIdx + shFilterLoadY * numFilters + shFilterLoadX; if (!conv) { filters += moduleIdx * numColors * filterPixels * numFilters; } targets += moduleIdx * numImages + (blockFilterIdx * B_Y * filtersPerThread + threadIdx.y*filtersPerThread) * numImages * numModulesY * numModulesX + myImgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for(int f = 0; f < filtersPerThread; f++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { prod[f][g] = 0; } } //float* shImgLoad = &shImages[0][threadIdx.x]; for (int p = 0; p < filterPixels; p += pixelCache) { /* * Load pixelCache pixels from B_Y*filtersPerThread filters * This condition covers the case when B_X is not divisible by filtersPerThread. * In this case, not all of the threads will participate in the loading operation. * This ensures that in each loop iteration, an integer number of rows of shFilters * are filled, which makes indexing simple. 
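Concretely, the grid the host code builds for this kernel (the formulas appear in _filterActs below; the numbers here are illustrative):

// B_X = 32, B_Y = 4, imgsPerThread = 4, filtersPerThread = 8,
// numImages = 128, numFilters = 32, numModules = 36:
//   blocks.x = DIVUP(128, 32 * 4)        = 1    // image batches
//   blocks.y = (36 * 32) / (4 * 8)       = 36   // one filter batch per module
//   threads  = (32, 4), so each block covers 128 images x 32 filters of one module.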
*/ if (B_X % filtersPerThread == 0 || shFilterLoadY < B_X/filtersPerThread) { #pragma unroll for (int p2 = 0; p2 < pixelCache; p2 += B_X/filtersPerThread) { const bool omit = pixelCache % (B_X / filtersPerThread) == 0; const int preloadPx = shFilterLoadY + p2; if (omit || preloadPx < pixelCache) { if (p + preloadPx < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = filters[(c * filterPixels + p + p2) * numFilters]; } } else { #pragma unroll for (int c = 0; c < numColors; c++) { shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = 0; } } } } } /* * Load pixelCache pixels from B_X*imgsPerThread images. */ #pragma unroll for (int ly = 0; ly < pixelCache; ly += B_Y) { const int preloadPx = ly + threadIdx.y; const int pixIdx = p + preloadPx; const bool omit = pixelCache % B_Y == 0; // Compile-time condition /* * Don't load any image pixels corresponding to filter pixels that don't exist. */ if (pixIdx < filterPixels && (omit || preloadPx < pixelCache)) { const int x = imgLoadModPosX + pixIdx % filterSize; const int y = imgLoadModPosY + pixIdx / filterSize; if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) { float* m = &images[imgStride * (y * imgSizeX + x)]; #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = m[c * imgStride * imgPixels + i * B_X]; } else { shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = 0; } } } } else { // Padding #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = 0; } } } } } __syncthreads(); #pragma unroll for (int i = 0; i < pixelCache*numColors; i++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { prod[f][g] += shImages[i][g + threadIdx.x * imgsPerThread] * shFilters[i][threadIdx.y * filtersPerThread + f]; } } } __syncthreads(); } if (scale) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { targets[g * B_X + f * numImages * numModules] = scaleTargets * targets[g * B_X + f * numImages * numModules] + scaleOutputs * prod[f][g]; } } } } else { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[g * B_X + f * numImages * numModules] = scaleOutputs * prod[f][g]; } } } } } /* * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. 
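Stripped of the convolution indexing, every kernel in this file reduces to the same inner pattern: stage a tile of both operands in shared memory, then have each thread accumulate an imgsPerThread x filtersPerThread patch of outputs in registers, reusing each staged value several times. A compact sketch of that register blocking under simplifying assumptions (block (32, 4), depth a multiple of TILE, all names hypothetical):

#include <cuda_runtime.h>

#define TILE 8  // depth staged per iteration; plays the role of pixelCache

// images: depth x 128 (32 threads * 4 images), filters: depth x 16 (4 threads * 4 filters),
// out: 16 x 128. Launch: tileAccumulate<<<1, dim3(32, 4)>>>(d_imgs, d_flts, d_out, depth).
__global__ void tileAccumulate(const float* images, const float* filters,
                               float* out, int depth) {
    __shared__ float shImg[TILE][128];
    __shared__ float shFlt[TILE][16];
    float prod[4][4] = {};                                   // per-thread output patch
    for (int d = 0; d < depth; d += TILE) {
        for (int r = threadIdx.y; r < TILE; r += 4) {        // cooperative staging
            for (int i = 0; i < 4; ++i)
                shImg[r][threadIdx.x * 4 + i] = images[(d + r) * 128 + threadIdx.x * 4 + i];
            if (threadIdx.x < 16)
                shFlt[r][threadIdx.x] = filters[(d + r) * 16 + threadIdx.x];
        }
        __syncthreads();
        for (int r = 0; r < TILE; ++r)                       // the hot loop: each staged
            for (int i = 0; i < 4; ++i)                      // value is reused 4x from
                for (int f = 0; f < 4; ++f)                  // shared memory
                    prod[i][f] += shImg[r][threadIdx.x * 4 + i]
                                * shFlt[r][threadIdx.y * 4 + f];
        __syncthreads();
    }
    for (int i = 0; i < 4; ++i)
        for (int f = 0; f < 4; ++f)
            out[(threadIdx.y * 4 + f) * 128 + threadIdx.x * 4 + i] = prod[i][f];
}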
* threadIdx.x determines image * threadIdx.y determines filter * * blockIdx.x determines image batch of B_X * imgsPerThread * blockIdx.y determines filter batch of B_Y * filtersPerThread * * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModules, numFilterColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModulesY, numModulesX, numImages) * * B_Y one of 4, 8, 16 * B_X one of 16, 32 * imgsPerThread one of 1, 2, 4 * filtersPerThread one of 1, 2, 4, 8 * colorCache: how many colors to put into shmem * * numFilters should be divisible by B_Y * filtersPerThread * numImages be divisible by B_X * imgsPerThread * numFilterColors should be divisible by colorCache. * numImgColors must be even. * numFilters must be divisible by numGroups. * no restrictions on pixelCache * The imgSize here is the size of the actual image without the padding. * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for maximum efficiency. * */ template __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs, const bool conv) { __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int numFilterColors = numImgColors / numGroups; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numModules = numModulesX * numModulesY; const int blockColorIdx = numFilterColors * blockGroupIdx; const int tidx = threadIdx.y * B_X + threadIdx.x; const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; const int shFilterLoadY = tidx / (B_Y * filtersPerThread); const int shFilterLoadX = tidx % (B_Y * filtersPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; filters +=blockFilterIdx + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; if (!conv) { filters += moduleIdx * numFilterColors * filterPixels * numFilters; } targets += moduleIdx * numImages + (blockFilterIdx + threadIdx.y) * numImages * numModules + myImgIdx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for(int f = 0; f < filtersPerThread; f++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { prod[f][g] = 0; } } const int imgStartX = MAX(0, imgLoadModPosX); const int imgStartY = MAX(0, imgLoadModPosY); const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); // __shared__ int imgPos[] for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { const int filterPxY = imgY - 
imgLoadModPosY; for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { const int filterPxX = imgX - imgLoadModPosX; const int p = filterPxY * filterSize + filterPxX; for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) /* * Load a pixel from B_Y*filtersPerThread filters * This condition covers the case when B_X is not divisible by filtersPerThread. * In this case, not all of the threads will participate in the loading operation. * This ensures that in each loop iteration, an integer number of rows of shFilters * are filled, which makes indexing simple. * nvcc is behaving in a completely insane way: removing this condition under * template parameters that guarantee it to be true actually slows down * the computation. * */ if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { if (colorCache % (B_X/filtersPerThread) == 0 || c + shFilterLoadY < colorCache) { shFilters[c + shFilterLoadY][shFilterLoadX] = filters[((oc+c) * filterPixels + p) * numFilters]; } } } /* * Load a pixel from B_X*imgsPerThread images. */ const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img float* m = &images[imgStride * (oc * imgPixels + pixIdx)]; #pragma unroll for (int c = 0; c < colorCache; c += B_Y) { if (colorCache % B_Y == 0 || threadIdx.y + c < colorCache) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkImgBounds || myImgIdx + i * B_X < numImages) { shImages[c + threadIdx.y][threadIdx.x + i * B_X] = m[c * imgStride * imgPixels + i * B_X]; } else { shImages[c + threadIdx.y][threadIdx.x + i * B_X] = 0; } } } } __syncthreads(); for (int c = 0; c < colorCache; c++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[f][g] += shImages[c][g * B_X + threadIdx.x] * shFilters[c][threadIdx.y + f * B_Y]; } } } __syncthreads(); } } } if (scale) { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g]; } } } } else { // Note: reversing order of these loops saves 2 registers, but costs time #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g]; } } } } } /*****************************Function Revision Record***************************** * Author: Tencent BestImage Team(ankerguo@tencent.com) * * Date: 2015-05-18 * * Reason: Optimizing kernel to get faster speed according to GPU features * * Method: * * 1. reorganizing data structure to avoid bank conflict; * * 2. using vectorized data type; * * Note: This function can be used when each thread loads even number of filter * * pixels(filtersPerThread * colorCache / B_X is even), and this can be * * optimized more when the number of loaded image's pixel is even. 
* *********************************************************************************/ template __global__ void filterActs_YxX_sparse2_f_vec(float* images, float* filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs, const bool conv) { // improve shared memory's band width by using 'float2' instead of 'float' __shared__ float2 shFilters[colorCache/2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int numFilterColors = numImgColors / numGroups; const int blocksPerModule = numFilters / (B_Y*filtersPerThread); const int moduleIdx = blockIdx.y / blocksPerModule; const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numModules = numModulesX * numModulesY; const int blockColorIdx = numFilterColors * blockGroupIdx; const int tidx = ty * B_X + tx; const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; // load position of filters' pixels for current thread const int shFilterLoadY = tidx / (B_Y * filtersPerThread); const int shFilterLoadX = tidx % (B_Y * filtersPerThread); // load position of images' pixels for current thread const int shImgLoadY = tidx / (B_X * imgsPerThread); const int shImgLoadX = tidx % (B_X * imgsPerThread); const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX; images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx; filters +=blockFilterIdx + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; if (!conv) { filters += moduleIdx * numFilterColors * filterPixels * numFilters; } targets += moduleIdx * numImages + (blockFilterIdx + ty) * numImages * numModules + blockIdx.x * B_X * imgsPerThread + tx; float prod[filtersPerThread][imgsPerThread]; #pragma unroll for(int f = 0; f < filtersPerThread; f++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { prod[f][g] = 0; } } const int imgStartX = MAX(0, imgLoadModPosX); const int imgStartY = MAX(0, imgLoadModPosY); const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); // temporary buffer to store the filter's loaded pixels during each loop float fPreload[colorCache * filtersPerThread / B_X]; // temporary buffer to store the image's loaded pixels during each loop float iPreload[colorCache * imgsPerThread / B_Y]; // preload filter's pixels #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize + (imgStartX - imgLoadModPosX)) * numFilters]; } // preload image's pixels if (!checkImgBounds || myImgIdx < numImages) { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + 
imgStartY * imgSizeX + imgStartX) * imgStride]; } } else { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = 0; } } for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { //const int filterPxY = imgY - imgLoadModPosY; for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) // store the preloaded filter's pixels into shared memory #pragma unroll for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) { shFilters[c + shFilterLoadY][shFilterLoadX].x = fPreload[c * filtersPerThread / B_X]; shFilters[c + shFilterLoadY][shFilterLoadX].y = fPreload[(c + colorCache / 2) * filtersPerThread / B_X]; } // store the preloaded image's pixels into shared memory #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { shImages[c + shImgLoadY][shImgLoadX] = iPreload[c * imgsPerThread / B_Y]; } /* * Load a pixel from B_Y*filtersPerThread filters * This condition covers the case when B_X is not divisible by filtersPerThread. * In this case, not all of the threads will participate in the loading operation. * This ensures that in each loop iteration, an integer number of rows of shFilters * are filled, which makes indexing simple. * nvcc is behaving in a completely insane way: removing this condition under * template parameters that guarantee it to be true actually slows down * the computation. * */ /* preload image and filter pixels' data */ if ((oc + colorCache) == numFilterColors) { // move to next pixel when all colors of current pixel have been finished int imgXn = (imgX < (imgEndX - 1)) ? (imgX + 1) : imgStartX; int imgYn = imgY + (imgXn != (imgX + 1)); #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize + (imgXn - imgLoadModPosX)) * numFilters]; } if (!checkImgBounds || myImgIdx < numImages) { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride]; } } else { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = 0; } } } else { // move next colorCache #pragma unroll for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { fPreload[c * filtersPerThread / B_X] = filters[((c + oc + colorCache) * filterPixels + (imgY - imgLoadModPosY) * filterSize + (imgX - imgLoadModPosX)) * numFilters]; } if (!checkImgBounds || myImgIdx < numImages) { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = images[((c + oc + colorCache) * imgPixels + imgY * imgSizeX + imgX) * imgStride]; } } else { #pragma unroll for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { iPreload[c * imgsPerThread / B_Y] = 0; } } } __syncthreads(); // convolution for (int c = 0; c < colorCache / 2; c++) { #pragma unroll for(int g = 0; g < imgsPerThread; g++) { #pragma unroll for(int f = 0; f < filtersPerThread; f++) { prod[f][g] += shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x; prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] * shFilters[c][ty + f * B_Y].y; } } } __syncthreads(); } } } // write convolution result into global memory if (scale) { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { #pragma unroll for (int f = 0; f 
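The imgXn/imgYn computation above advances the preloader to the next visited pixel, wrapping to the start of the next row at the row boundary. Traced on small illustrative values:

// imgStartX = 1, imgEndX = 4 (columns 1..3 visited):
//   imgX = 2 -> imgXn = 3, imgYn = imgY        // advance within the row
//   imgX = 3 -> imgXn = 1, imgYn = imgY + 1    // imgXn != imgX + 1, so wrap to
//                                              // the row start and step the row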
< filtersPerThread; f++) { targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g]; } } } } else { // Note: reversing order of these loops saves 2 registers, but costs time #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int g = 0; g < imgsPerThread; g++) { if (!checkImgBounds || myImgIdx + g * B_X < numImages) { targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g]; } } } } } /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModules, numFilterColors, filterPixels, numFilters) otherwise * * targets: (numFilters, numModules, numImages) * * Note: all of these convolution routines are optimized for the case when * the number of images (i.e. the minibatch size) is a multiple of 128. * Other batch sizes will work, but but I made no attempt whatsoever * to make them work fast. */ void _filterActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, int numImgColors, int numGroups, float scaleTargets, float scaleOutput, bool conv) { CAFFE_ENFORCE(images->ndim() == 2); CAFFE_ENFORCE(filters->ndim() == 2); CAFFE_ENFORCE(targets->ndim() == 2); int numFilterColors = numImgColors / numGroups; int numFilters = filters->dim32(1); int numModules = numModulesY * numModulesX; int numImages = images->dim32(1); int imgPixels = images->dim32(0) / numImgColors; int imgSizeX = imgPixels / imgSizeY; int filterModuleMult = conv ? 1 : numModules; CAFFE_ENFORCE(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))); CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0); CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0); CAFFE_ENFORCE(numImgColors % numGroups == 0); CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors); CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels); int numFiltersPerGroup = numFilters / numGroups; int imgStride = images->dim32(1); int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors); int filterSize = int(sqrt(filterPixels)); CAFFE_ENFORCE(filterSize * filterSize == filterPixels); CAFFE_ENFORCE(filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels); // These routines don't handle the case when only part of the image is visited in the convolution CAFFE_ENFORCE(paddingStart <= 0); CAFFE_ENFORCE(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX); CAFFE_ENFORCE(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY); CAFFE_ENFORCE(moduleStride <= filterSize); int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; int filtersPerThread, threadsY = 4; if (numImgColors <= 3) { // Special kernels written for colors = 3, filters = 64 and colors = 3, filters = 48 cases. // The remaining cases use the old routines. // TODO: Modernize the remaining cases if you care about them. filtersPerThread = numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 48 == 0 ? 12 : numFiltersPerGroup % 32 == 0 ? 8 : 4; } else { filtersPerThread = numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 32 == 0 ? 8 : 4; threadsY = numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 && imgsPerThread != 4 ? 
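The launch-shape selection above picks how much work each thread owns from the divisibility of the problem sizes. Factored out as standalone helpers for clarity (illustrative only; _filterActs inlines this logic):

static inline int pickImgsPerThread(int numImages) {
    return numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
}
static inline int pickFiltersPerThread(int numFiltersPerGroup, int numImgColors) {
    if (numImgColors <= 3) {  // special 3-color kernels cover the 64- and 48-filter cases
        return numFiltersPerGroup % 64 == 0 ? 16
             : numFiltersPerGroup % 48 == 0 ? 12
             : numFiltersPerGroup % 32 == 0 ? 8 : 4;
    }
    return numFiltersPerGroup % 64 == 0 ? 16
         : numFiltersPerGroup % 32 == 0 ? 8 : 4;
}
static inline int pickThreadsY(int numFiltersPerGroup, int numFilterColors, int imgsPerThread) {
    return (numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 && imgsPerThread != 4) ? 8 : 4;
}
// e.g. numImages = 96: imgsPerThread = 1 (96 divides neither 128 nor 64), and
// checkImgBounds = (96 % (32 * 1) != 0) = false, so the unguarded kernels run.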
8 : 4; } int threadsX = 32; dim3 threads(threadsX, threadsY); dim3 blocks = dim3(DIVUP(numImages, threads.x * imgsPerThread), (numModules * numFilters) / (threads.y * filtersPerThread)); bool checkImgBounds = numImages % (threads.x*imgsPerThread) != 0; bool scale = scaleTargets != 0; if (scaleTargets == 0) { targets->Resize(std::vector{numFilters * numModules, numImages}); } else { CAFFE_ENFORCE(targets->dim32(0) == numFilters * numModules); CAFFE_ENFORCE(targets->dim32(1) == numImages); } cudaTextureObject_t tex_images = GetTensorTextureObject(images); cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); float* images_data = images->mutable_data(); float* filters_data = filters->mutable_data(); float* targets_data = targets->mutable_data(); const std::size_t images_bytes = images->nbytes(); cudaStream_t stream = context->cuda_stream(); checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); // using wider band width // Auto-generated calling code... // NOTE: The calling code is set up such that if checkImgBounds is true, then imgsPerThread = 1. // In principle it doesn't have to be this way, and you may want to optimize for that case. if (scale == false) { if (checkImgBounds == false) { if (numFilterColors % 8 == 0) { if (numImages % 128 == 0) { if (numFiltersPerGroup % 128 == 0) { if (images_bytes < TEXTURE_SIZE_MAX) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false > <<>>(tex_images, tex_filters, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else { cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } } else if (numFiltersPerGroup % 64 == 0) { if (images_bytes < TEXTURE_SIZE_MAX) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false > <<>>(tex_images, tex_filters, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else { cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false > <<>>(images_data, filters_data, 
                            targets_data, numImages, numFilters, imgSizeY, imgSizeX,
                            filterSize, paddingStart, moduleStride, numModulesY,
                            numModulesX, imgStride, numImgColors, numGroups,
                            scaleTargets, scaleOutput, conv);
                    }

// ---------------------------------------------------------------------------
// Kernel-dispatch helpers. Each branch of the ladders below picks one template
// instantiation, sets its cache preference, and launches it with the
// `blocks`/`threads` dims computed earlier in this function, on the Caffe2
// context stream.
// ---------------------------------------------------------------------------
#define FA_LAUNCH_SPARSE2(KERNEL, CACHE_PREF, IMGS, FILTS, T_Y, T_X, IPT, FPT, CC, SCALE, CHK) \
    cudaFuncSetCacheConfig(KERNEL<T_Y, T_X, IPT, FPT, CC, SCALE, CHK>, CACHE_PREF);            \
    KERNEL<T_Y, T_X, IPT, FPT, CC, SCALE, CHK>                                                 \
        <<<blocks, threads, 0, context->cuda_stream()>>>(                                      \
            IMGS, FILTS, targets_data, numImages, numFilters, imgSizeY, imgSizeX,              \
            filterSize, paddingStart, moduleStride, numModulesY, numModulesX,                  \
            imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv)

#define FA_LAUNCH_COLOR(KERNEL, IMGS, FILTS, T_Y, T_X, IPT, FPT, CC, PX, SCALE, CHK)           \
    cudaFuncSetCacheConfig(KERNEL<T_Y, T_X, IPT, FPT, CC, PX, SCALE, CHK>,                     \
                           cudaFuncCachePreferShared);                                         \
    KERNEL<T_Y, T_X, IPT, FPT, CC, PX, SCALE, CHK>                                             \
        <<<blocks, threads, 0, context->cuda_stream()>>>(                                      \
            IMGS, FILTS, targets_data, numImages, numFilters, imgSizeY, imgSizeX,              \
            filterSize, paddingStart, moduleStride, numModulesY, numModulesX,                  \
            imgStride, scaleTargets, scaleOutput, conv)

// Filter-count ladder for numFilterColors % 8 == 0. VECK is the wide kernel:
// the vectorized f_vec variant when no image-bounds checks are needed, plain
// sparse2 otherwise.
#define FA_C8_LADDER(VECK, IPT, SCALE, CHK)                                                    \
    if (numFiltersPerGroup % 128 == 0) {                                                       \
        FA_LAUNCH_SPARSE2(VECK, cudaFuncCachePreferShared, images_data, filters_data,         \
                          8, 32, IPT, 16, 8, SCALE, CHK);                                      \
    } else if (numFiltersPerGroup % 64 == 0) {                                                 \
        FA_LAUNCH_SPARSE2(VECK, cudaFuncCachePreferShared, images_data, filters_data,         \
                          4, 32, IPT, 16, 8, SCALE, CHK);                                      \
    } else if (numFiltersPerGroup % 32 == 0) {                                                 \
        FA_LAUNCH_SPARSE2(VECK, cudaFuncCachePreferShared, images_data, filters_data,         \
                          4, 32, IPT, 8, 8, SCALE, CHK);                                       \
    } else { /* numFiltersPerGroup % 1 == 0 */                                                 \
        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,                   \
                          images_data, filters_data, 4, 32, IPT, 4, 8, SCALE, CHK);            \
    }

// Filter-count ladder for numFilterColors % 4 == 0. The 128- and 64-filter
// cases launch the same instantiation, so a single % 64 test covers both.
#define FA_C4_LADDER(IPT, SCALE, CHK)                                                          \
    if (numFiltersPerGroup % 64 == 0) {                                                        \
        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,                   \
                          images_data, filters_data, 4, 32, IPT, 16, 4, SCALE, CHK);           \
    } else if (numFiltersPerGroup % 32 == 0) {                                                 \
        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,                   \
                          images_data, filters_data, 4, 32, IPT, 8, 4, SCALE, CHK);            \
    } else {                                                                                   \
        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,                   \
                          images_data, filters_data, 4, 32, IPT, 4, 4, SCALE, CHK);            \
    }

// Filter-count ladder for the direct 1-3 color kernels.
#define FA_COLOR_LADDER(IPT, CC, SCALE, CHK)                                                   \
    if (numFiltersPerGroup % 64 == 0) {                                                        \
        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,                       \
                        4, 32, IPT, 16, CC, 4, SCALE, CHK);                                    \
    } else if (numFiltersPerGroup % 48 == 0) {                                                 \
        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,                       \
                        4, 32, IPT, 12, CC, 4, SCALE, CHK);                                    \
    } else if (numFiltersPerGroup % 32 == 0) {                                                 \
        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,                       \
                        4, 32, IPT, 8, CC, 4, SCALE, CHK);                                     \
    } else {                                                                                   \
        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,                       \
                        4, 32, IPT, 4, CC, 4, SCALE, CHK);                                     \
    }

                    else { // numFiltersPerGroup % 1 == 0
                        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,
                                          images_data, filters_data, 4, 32, 4, 4, 8,
                                          false, false);
                    }
                } else if (numImages % 64 == 0) {
                    FA_C8_LADDER(filterActs_YxX_sparse2_f_vec, 2, false, false)
                } else if (numImages % 32 == 0) {
                    FA_C8_LADDER(filterActs_YxX_sparse2_f_vec, 1, false, false)
                }
            } else if (numFilterColors % 4 == 0) {
                if (numImages % 128 == 0) {
                    FA_C4_LADDER(4, false, false)
                } else if (numImages % 64 == 0) {
                    FA_C4_LADDER(2, false, false)
                } else if (numImages % 32 == 0) {
                    FA_C4_LADDER(1, false, false)
                }
            } else if (numFilterColors == 3) {
                if (numImages % 128 == 0) {
                    if (numFiltersPerGroup % 64 == 0) {
                        FA_LAUNCH_COLOR(
                            filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex,
                            tex_images, tex_filters, 4, 32, 4, 16, 3, 4, false, false);
                    } else if (numFiltersPerGroup % 48 == 0) {
                        FA_LAUNCH_COLOR(
                            filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex,
                            tex_images, tex_filters, 4, 32, 4, 12, 3, 4, false, false);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,
                                        4, 32, 4, 8, 3, 4, false, false);
                    } else {
                        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,
                                        4, 32, 4, 4, 3, 4, false, false);
                    }
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 3, false, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 3, false, false)
                }
            } else if (numFilterColors == 2) {
                if (numImages % 128 == 0) {
                    FA_COLOR_LADDER(4, 2, false, false)
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 2, false, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 2, false, false)
                }
            } else if (numFilterColors == 1) {
                if (numImages % 128 == 0) {
                    FA_COLOR_LADDER(4, 1, false, false)
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 1, false, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 1, false, false)
                }
            }
        }
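        // Shape-specialized dispatch: imgsPerThread (4/2/1) follows the
        // divisibility of numImages by 128/64/32, and filtersPerThread follows
        // the divisibility of numFiltersPerGroup, so e.g. numImages == 256
        // with numFiltersPerGroup == 64 selects a <4, 32, 4, 16, ...>
        // instantiation. Batch sizes that are not a multiple of 32 take the
        // bounds-checked arm below, which processes one image per thread.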
        else if (checkImgBounds == true) { // any batch size
            if (numFilterColors % 8 == 0) {
                FA_C8_LADDER(filterActs_YxX_sparse2, 1, false, true)
            } else if (numFilterColors % 4 == 0) {
                FA_C4_LADDER(1, false, true)
            } else if (numFilterColors == 3) {
                FA_COLOR_LADDER(1, 3, false, true)
            } else if (numFilterColors == 2) {
                FA_COLOR_LADDER(1, 2, false, true)
            } else if (numFilterColors == 1) {
                FA_COLOR_LADDER(1, 1, false, true)
            }
        }
    } else if (scale == true) {
        if (checkImgBounds == false) {
            if (numFilterColors % 8 == 0) {
                if (numImages % 128 == 0) {
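                    // Widest tiles: when the image buffer fits under the
                    // texture-object limit (images_bytes < TEXTURE_SIZE_MAX),
                    // images and filters are read through tex_images and
                    // tex_filters with L1 preferred over shared memory;
                    // otherwise the same instantiation runs on plain
                    // global-memory pointers.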
                    if (numFiltersPerGroup % 64 == 0) {
                        // The 128- and 64-filter cases use the same preloading
                        // instantiation, so one divisibility test covers both.
                        if (images_bytes < TEXTURE_SIZE_MAX) {
                            FA_LAUNCH_SPARSE2(
                                filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex,
                                cudaFuncCachePreferL1, tex_images, tex_filters,
                                4, 32, 4, 16, 4, true, false);
                        } else {
                            FA_LAUNCH_SPARSE2(
                                filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4,
                                cudaFuncCachePreferL1, images_data, filters_data,
                                4, 32, 4, 16, 4, true, false);
                        }
                    } else if (numFiltersPerGroup % 32 == 0) {
                        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2_f_vec,
                                          cudaFuncCachePreferShared, images_data,
                                          filters_data, 4, 32, 4, 8, 8, true, false);
                    } else {
                        FA_LAUNCH_SPARSE2(filterActs_YxX_sparse2, cudaFuncCachePreferShared,
                                          images_data, filters_data, 4, 32, 4, 4, 8,
                                          true, false);
                    }
                } else if (numImages % 64 == 0) {
                    FA_C8_LADDER(filterActs_YxX_sparse2_f_vec, 2, true, false)
                } else if (numImages % 32 == 0) {
                    FA_C8_LADDER(filterActs_YxX_sparse2_f_vec, 1, true, false)
                }
            } else if (numFilterColors % 4 == 0) {
                if (numImages % 128 == 0) {
                    FA_C4_LADDER(4, true, false)
                } else if (numImages % 64 == 0) {
                    FA_C4_LADDER(2, true, false)
                } else if (numImages % 32 == 0) {
                    FA_C4_LADDER(1, true, false)
                }
            }
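            // Three-channel (RGB) input gets dedicated color kernels; at the
            // largest batch tile the 64- and 48-filter cases additionally use
            // the texture-preloading variants.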
            else if (numFilterColors == 3) {
                if (numImages % 128 == 0) {
                    if (numFiltersPerGroup % 64 == 0) {
                        FA_LAUNCH_COLOR(
                            filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex,
                            tex_images, tex_filters, 4, 32, 4, 16, 3, 4, true, false);
                    } else if (numFiltersPerGroup % 48 == 0) {
                        FA_LAUNCH_COLOR(
                            filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex,
                            tex_images, tex_filters, 4, 32, 4, 12, 3, 4, true, false);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,
                                        4, 32, 4, 8, 3, 4, true, false);
                    } else {
                        FA_LAUNCH_COLOR(filterActs_YxX_color, images_data, filters_data,
                                        4, 32, 4, 4, 3, 4, true, false);
                    }
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 3, true, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 3, true, false)
                }
            } else if (numFilterColors == 2) {
                if (numImages % 128 == 0) {
                    FA_COLOR_LADDER(4, 2, true, false)
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 2, true, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 2, true, false)
                }
            } else if (numFilterColors == 1) {
                if (numImages % 128 == 0) {
                    FA_COLOR_LADDER(4, 1, true, false)
                } else if (numImages % 64 == 0) {
                    FA_COLOR_LADDER(2, 1, true, false)
                } else if (numImages % 32 == 0) {
                    FA_COLOR_LADDER(1, 1, true, false)
                }
            }
        }
        else if (checkImgBounds == true) { // any batch size
            if (numFilterColors % 8 == 0) {
                FA_C8_LADDER(filterActs_YxX_sparse2, 1, true, true)
            } else if (numFilterColors % 4 == 0) {
                FA_C4_LADDER(1, true, true)
            } else if (numFilterColors == 3) {
                FA_COLOR_LADDER(1, 3, true, true)
            } else if (numFilterColors == 2) {
                FA_COLOR_LADDER(1, 2, true, true)
            } else if (numFilterColors == 1) {
                FA_COLOR_LADDER(1, 1, true, true)
            }
        }
    }

#undef FA_COLOR_LADDER
#undef FA_C4_LADDER
#undef FA_C8_LADDER
#undef FA_LAUNCH_COLOR
#undef FA_LAUNCH_SPARSE2

    checkCudaErrors(cudaDestroyTextureObject(tex_images));
    checkCudaErrors(cudaDestroyTextureObject(tex_filters));
    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
    getLastCudaError("filterActs: kernel execution failed");
}
imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); } } } else if (numFilterColors == 1) { if (numImages % 1 == 0) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, true >, cudaFuncCachePreferShared); filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, true > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, true >, cudaFuncCachePreferShared); filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, true > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, true >, cudaFuncCachePreferShared); filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, true > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, true >, cudaFuncCachePreferShared); filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, true > <<>>(images_data, filters_data, targets_data, numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); } } } } } checkCudaErrors(cudaDestroyTextureObject(tex_images)); checkCudaErrors(cudaDestroyTextureObject(tex_filters)); checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); getLastCudaError("filterActs: kernel execution failed"); } void convFilterActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, int numImgColors, int numGroups) { convFilterActs(context, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1); } void convFilterActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, int numImgColors, int numGroups, float scaleTargets, float scaleOutput) { _filterActs(context, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true); } void localFilterActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, int numImgColors, int numGroups) { localFilterActs(context, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1); } void localFilterActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, 
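/*
 * Usage sketch for the wrappers above. Illustrative only, not part of the
 * original cuda-convnet2 sources; the tensor names and concrete sizes are
 * assumptions chosen for the example. Shapes follow the (channels * pixels,
 * images) column layout that _filterActs enforces:
 *
 *   caffe2::TensorCUDA images;   // (numImgColors * imgSizeY * imgSizeX, numImages)
 *   caffe2::TensorCUDA filters;  // (numFilterColors * filterSize * filterSize, numFilters)
 *   caffe2::TensorCUDA targets;  // resized by the call to (numFilters * numModules, numImages)
 *
 *   convFilterActs(&context, &images, &filters, &targets,
 *                  imgSizeY, numModulesY, numModulesX,
 *                  paddingStart, moduleStride, numImgColors, numGroups);
 *
 * The shorter overload defaults to scaleTargets = 0 (overwrite targets) and
 * scaleOutput = 1; localFilterActs is identical except that each module has
 * its own untied filter bank (conv == false in _filterActs).
 */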
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vector>

#include "../include/cudaconv2.cuh"

/*
 * Block size: 16x16.
 * blockIdx.x determines case in batches of 16*imgsPerThread.
 * blockIdx.y determines 4x4 image region in target image.
 *
 * threadIdx.x determines case.
 * threadIdx.y determines pixel.
 *
 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
 * filters: (numColors, filterPixels, numFilters)                               if conv
 *          (numModulesY, numModulesX, numColors, filterPixels, numFilters)     otherwise
 * targets: (numColors, imgSizeY, imgSizeX, numImages)
 *
 * Each block reconstructs one 4x4 block of pixels from 16*imgsPerThread cases.
 *
 * Number of filters must be divisible by 16.
 * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is false.
 * 16 * imgsPerThread must be divisible by 32.
 *
 * This version loads 32 cases at a time, so it gets full coalescing on that load.
 * It only loads 16 weights at a time, so those aren't fully coalesced.
 * This version conserves shared memory by loading 16 filters at a time rather than 32.
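 *
 * Worked example of the region indexing (illustrative numbers, not from the
 * original comments): with imgSizeX = imgSizeY = 8 there are DIVUP(8,4) = 2
 * regions per row, so blockIdx.y = 3 is region (y=1, x=1), i.e.
 * blockRegionTop = blockRegionLeft = 4. The thread with threadIdx.y = 5 owns
 * pixel (pxY=5, pxX=5), since pxYInRegion = 5/4 = 1 and pxXInRegion = 5%4 = 1.
 * With filterSize = 4, paddingStart = 0, moduleStride = 2 (so numModulesY = 3),
 * the modules whose 4x4 window can touch region rows 4..7 run from
 * startY = 1 + (4 - 0 - 4)/2 = 1 to endY = MIN(3, 1 + (4 + 3 - 0)/2) = 3,
 * i.e. modules my = 1, 2 only.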
 */
template <int imgsPerThread, int numColors, bool scale, bool checkCaseBounds, bool conv>
__global__ void img_acts_color(const float* hidActs, const float* filters, float* targets,
                               const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
                               const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
                               const float scaleTargets, const float scaleOutputs) {
    __shared__ float shFilters[numColors*16][16 + 1];
    __shared__ float shHidActs[16][16*imgsPerThread];

    const int blockCaseIdx = blockIdx.x * 16*imgsPerThread;
    const int numRegionsX = DIVUP(imgSizeX, 4);
    const int blockRegionIdx = blockIdx.y;
    const int blockRegionIdxX = blockRegionIdx % numRegionsX;
    const int blockRegionIdxY = blockRegionIdx / numRegionsX;
    const int blockRegionLeft = blockRegionIdxX * 4;
    const int blockRegionTop = blockRegionIdxY * 4;
    const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
    const int pxY = blockRegionTop + pxYInRegion;
    const int pxX = blockRegionLeft + pxXInRegion;
    const int pxIdx = pxY * imgSizeX + pxX;
    const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
    const int numModules = numModulesY * numModulesX;
    const int filterPixels = filterSize * filterSize;
    const int imgPixels = imgSizeX * imgSizeY;
    const int tidx = threadIdx.y * 16 + threadIdx.x;
    const int loadY = tidx / 32, loadX = tidx % 32;

    hidActs += blockCaseIdx + loadY * numImages * numModules + loadX;
    filters += threadIdx.x;
    targets += pxIdx * numImages + blockCaseIdx + threadIdx.x;

    float prod[numColors][imgsPerThread];
    #pragma unroll
    for (int c = 0; c < numColors; c++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[c][i] = 0;
        }
    }
    const int startY = blockRegionTop - paddingStart < filterSize ? 0
                        : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
    const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
    const int startX = blockRegionLeft - paddingStart < filterSize ? 0
                        : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
    const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);

    float* shilterLoad = &shFilters[threadIdx.y][threadIdx.x];
    float* shHidActLoad = &shHidActs[loadY][loadX];

    for (int my = startY; my < endY; my++) {
        const int moduleTop = paddingStart + my * moduleStride;
        const int pxInModuleY = pxY - moduleTop;

        for (int mx = startX; mx < endX; mx++) {
            const int moduleIdx = my * numModulesX + mx;
            const int moduleLeft = paddingStart + mx * moduleStride;
            const int pxInModuleX = pxX - moduleLeft;

            const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
            const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;

            for (int f = 0; f < numFilters; f += 16) { // multiply with 16 filters at a time
                // Now the threads split up into half-warps, and each half-warp decides if it's interested.
                const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
                #pragma unroll
                for (int i = 0; i < imgsPerThread * 16; i += 32) {
                    if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) {
                        #pragma unroll
                        for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
                            shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
                        }
                    } else {
                        #pragma unroll
                        for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
shHidActLoad[j * 16 * imgsPerThread + i] = 0; } } } if (isPxInImg && isPxInModule) { // This half-warp is interested, so it's going to load the weights from this module to its pixel. // Not fully coalesced read :( // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much. const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f] : &filters[(moduleIdx * numColors * filterPixels + pxIdxInModule) * numFilters + f]; #pragma unroll for (int c = 0; c < numColors; c++) { shilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters]; } } __syncthreads(); // Do some actual computation if (isPxInImg && isPxInModule) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int w = 0; w < 16; w++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16]; } } } } __syncthreads(); } } } // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though if (isPxInImg) { if (scale) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { #pragma unroll for (int c = 0; c < numColors; c++) { targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { #pragma unroll for (int c = 0; c < numColors; c++) { targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i]; } } } } } } /* * Block size: 16x16. * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread. * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread) * blockIdx.x.y = 1..numImgColors/colorsPerThread * blockIdx.y determines 4x4 image region in target image. * * threadIdx.x determines case. * threadIdx.y determines pixel. * * hidActs: (numFilters, numModulesY, numModulesX, numImages) * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise * targets: (numImageColors, imgSizeY, imgSizeX, numImages) * * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. * * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false. * 16 * imgsPerThread must be divisible by 32. * numImageColors/numGroups must be divisible by colorsPerThread. * * This version loads 32 cases at a time, so it gets full coalescing on that load. * It only loads 16 weights at a time, so those aren't fully coalesced. * This version conserves shared memory by loading 16 filters at a time rather than 32. * * To be used when there are 4-16 color channels. 
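 *
 * Note on the shared-memory declarations below (an explanatory aside; the same
 * scheme is used by img_acts_color above): shFilters rows are padded to 16 + 1
 * floats so that walking down a column steps 17 words between rows. 17 is
 * coprime with the 32 shared-memory banks, so a column walk touches a
 * different bank on each of 32 consecutive rows, where a 16-float row pitch
 * would alternate between just two banks.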
 */
template <int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
__global__ void img_acts_mediumcolor(const float* hidActs, const float* filters, float* targets,
                                     const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
                                     const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
                                     const int numImgColors, const int numGroups,
                                     const float scaleTargets, const float scaleOutputs) {
    __shared__ float shFilters[colorsPerThread*16][16 + 1];
    __shared__ float shHidActs[16][16*imgsPerThread];

    const int numImgBlocks = DIVUP(numImages,16*imgsPerThread);
    const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread;

    const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
    const int numFilterColors = numImgColors / numGroups;
    const int blockGroupIdx = imgColorIdx / numFilterColors;
    const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
    const int numFiltersPerGroup = numFilters / numGroups;
    const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;

    const int numRegionsX = DIVUP(imgSizeX, 4);
    const int blockRegionIdx = blockIdx.y;
    const int blockRegionIdxX = blockRegionIdx % numRegionsX;
    const int blockRegionIdxY = blockRegionIdx / numRegionsX;
    const int blockRegionLeft = blockRegionIdxX * 4;
    const int blockRegionTop = blockRegionIdxY * 4;
    const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
    const int pxY = blockRegionTop + pxYInRegion;
    const int pxX = blockRegionLeft + pxXInRegion;
    const int pxIdx = pxY * imgSizeX + pxX;
    const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
    const uint numModules = numModulesY * numModulesX;
    const int filterPixels = filterSize * filterSize;
    const int imgPixels = imgSizeY * imgSizeX;
    const int tidx = threadIdx.y * 16 + threadIdx.x;
    const int loadY = tidx / 32, loadX = tidx % 32;

    hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
    filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
    targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x;

    float prod[colorsPerThread][imgsPerThread];
    #pragma unroll
    for (int c = 0; c < colorsPerThread; c++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[c][i] = 0;
        }
    }
    const int startY = blockRegionTop - paddingStart < filterSize ? 0
                        : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
    const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
    const int startX = blockRegionLeft - paddingStart < filterSize ? 0
                        : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
    const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);

    float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
    float* shHidActLoad = &shHidActs[loadY][loadX];

    for (int my = startY; my < endY; my++) {
        const int moduleTop = paddingStart + my * moduleStride;
        const int pxInModuleY = pxY - moduleTop;

        for (int mx = startX; mx < endX; mx++) {
            const int moduleIdx = my * numModulesX + mx;
            const int moduleLeft = paddingStart + mx * moduleStride;
            const int pxInModuleX = pxX - moduleLeft;

            const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
            const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;

            for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
                // Now the threads split up into half-warps, and each half-warp decides if it's interested.
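                // Clarifying aside (not in the original source): the loadY/loadX
                // indices computed above re-shape the 16x16 thread block into 8 rows
                // of 32 threads for this load, so each row of 32 threads reads 32
                // consecutive cases and the global load coalesces. E.g. tidx =
                // threadIdx.y*16 + threadIdx.x = 37 loads as (loadY = 1, loadX = 5).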
const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; #pragma unroll for (int i = 0; i < imgsPerThread * 16; i += 32) { if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) { #pragma unroll for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; } } else { #pragma unroll for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. shHidActLoad[j * 16 * imgsPerThread + i] = 0; } } } if (isPxInImg && isPxInModule) { // This half-warp is interested, so it's going to load the weights from this module to its pixel. // Not fully coalesced read :( // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much. const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f] : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInModule * numFilters + f]; #pragma unroll for (int c = 0; c < colorsPerThread; c++) { shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters]; } } __syncthreads(); // Do some actual computation if (isPxInImg && isPxInModule) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int w = 0; w < 16; w++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16]; } } } } __syncthreads(); } } } // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though if (isPxInImg) { if (scale) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i]; } } } } } } /* * Block size: B_YxB_X. * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread. * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) * blockIdx.y determines image pixel in target image. * * threadIdx.x determines case. * threadIdx.y determines color. * * hidActs: (numFilters, numModulesY, numModulesX, numImages) * filters: (numFilterColors, filterPixels, numFilters) if conv * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise * targets: (numImageColors, imgSizeY, imgSizeX, numImages) * * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from B_X*imgsPerThread cases. * * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. * numFiltersPerGroup must be divisible by filterCache. * * B_X * imgsPerThread must be divisible by 32. * numFilterColors must be divisible by B_Y*colorsPerThread. * B_X*B_Y must be divisible by 32. * filterCache must be divisible by B_X*B_Y/32 * B_X*B_Y must be divisible by filterCache * This version loads 32 cases at a time, so it gets full coalescing on that load. 
 * It only loads filterCache weights at a time, so those aren't fully coalesced (depending on size of filterCache).
 *
 * To be used when there are >= 16 color channels.
 */
template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, int filterCache, bool scale, bool checkCaseBounds, bool conv>
__global__ void conv_img_acts_manycolor(const float* hidActs, const float* filters, float* targets,
                                        const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
                                        const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
                                        const int numImgColors, const int numGroups,
                                        const float scaleTargets, const float scaleOutputs) {
    __shared__ float shFilters[colorsPerThread*B_Y][filterCache + 1];
    __shared__ float shHidActs[filterCache][B_X*imgsPerThread];

    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;

    const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally
    const int numFilterColors = numImgColors / numGroups;
    const int blockGroupIdx = imgColorIdx / numFilterColors;
    const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
    const int numFiltersPerGroup = numFilters / numGroups;
    const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;

    const int blockPixelIdx = blockIdx.y;
    const int blockPixelIdxX = blockPixelIdx % imgSizeX;
    const int blockPixelIdxY = blockPixelIdx / imgSizeX;

    const int filterPixels = filterSize * filterSize;
    const int imgPixels = imgSizeY * imgSizeX;
    const int tidx = threadIdx.y * B_X + threadIdx.x;
    const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
    const int filtersLoadY = tidx / filterCache, filtersLoadX = tidx % filterCache;
    const int numModules = numModulesY * numModulesX;

    hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
    filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
    targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;

    float prod[colorsPerThread][imgsPerThread];
    #pragma unroll
    for (int c = 0; c < colorsPerThread; c++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[c][i] = 0;
        }
    }
    const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
                        : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
    const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
    const int startX = blockPixelIdxX - paddingStart < filterSize ?
0 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; for (int my = startY; my < endY; my++) { const int moduleTop = paddingStart + my * moduleStride; const int pxInFilterY = blockPixelIdxY - moduleTop; for (int mx = startX; mx < endX; mx++) { const int moduleIdx = my * numModulesX + mx; const int moduleLeft = paddingStart + mx * moduleStride; const int pxInFilterX = blockPixelIdxX - moduleLeft; const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; for (int f = 0; f < numFiltersPerGroup; f += filterCache) { // multiply with filterCache filters at a time const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; #pragma unroll for (int i = 0; i < imgsPerThread * B_X; i += 32) { if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) { #pragma unroll for (int j = 0; j < filterCache; j += B_X*B_Y/32) { // load filterCache rows of imgsPerThread*B_X cols, 8 * 32 elements at a time. shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; } } else { #pragma unroll for (int j = 0; j < filterCache; j += B_X*B_Y/32) { // load filterCache rows of imgsPerThread*B_X cols, 8 * 32 elements at a time. shHidActLoad[j * B_X * imgsPerThread + i] = 0; } } } const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f] : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f]; #pragma unroll for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCache) { if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCache) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { shFilterLoad[i * (filterCache + 1)] = fLoad[i * filterPixels * numFilters]; } } __syncthreads(); // Do some actual computation #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int w = 0; w < filterCache; w++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X]; } } } __syncthreads(); } } } if (scale) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; } } } } } /* * Block size: B_YxB_X. * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread. * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) * blockIdx.y determines image pixel in target image. * * threadIdx.x determines case. * threadIdx.y determines color. 
 *
 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
 * filters: (numFilterColors, filterPixels, numFilters)                               if conv
 *          (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters)     otherwise
 * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
 *
 * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from B_X*imgsPerThread cases.
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
 * numFiltersPerGroup must be divisible by filterCacheF.
 *
 * numFilterColors must be divisible by B_Y*colorsPerThread.
 * B_X*B_Y must be divisible by filterCacheF
 * filterCacheF must be divisible by filterCacheH
 *
 * This version loads 32 cases at a time, so it gets full coalescing on that load.
 * It only loads filterCacheF weights at a time, so those aren't fully coalesced (depending on size of filterCacheF).
 *
 * To be used when there are >= 16 color channels.
 */
template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, int filterCacheF, int filterCacheH, bool scale, bool checkCaseBounds, bool conv>
__global__ void conv_img_acts_manycolor_kepler(const float* hidActs, const float* filters, float* targets,
                                               const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
                                               const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
                                               const int numImgColors, const int numGroups,
                                               const float scaleTargets, const float scaleOutputs) {
    __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF];
    __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread];

    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;

    const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally
    const int numFilterColors = numImgColors / numGroups;
    const int blockGroupIdx = imgColorIdx / numFilterColors;
    const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
    const int numFiltersPerGroup = numFilters / numGroups;
    const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;

    const int blockPixelIdx = blockIdx.y;
    const int blockPixelIdxX = blockPixelIdx % imgSizeX;
    const int blockPixelIdxY = blockPixelIdx / imgSizeX;

    const int filterPixels = filterSize * filterSize;
    const int imgPixels = imgSizeY * imgSizeX;
    const int tidx = threadIdx.y * B_X + threadIdx.x;
    const int hidActLoadY = threadIdx.y, hidActLoadX = threadIdx.x;
    //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread);
    const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF;
    // nvcc is behaving idiotically again, these useless declarations save registers
    //const int outputY = threadIdx.y, outputX = threadIdx.x;
    //const int ty = threadIdx.y, tx = threadIdx.x;
    const int numModules = numModulesY * numModulesX;

    hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
    filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
    targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;

    float prod[colorsPerThread][imgsPerThread];
    #pragma unroll
    for (int c = 0; c < colorsPerThread; c++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[c][i] = 0;
        }
    }

    const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
                        : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
    const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
    const int startX = blockPixelIdxX - paddingStart < filterSize ?
0 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; //const bool noFLoop = filterCacheF == filterCacheH; for (int my = startY; my < endY; my++) { const int moduleTop = paddingStart + my * moduleStride; const int pxInFilterY = blockPixelIdxY - moduleTop; for (int mx = startX; mx < endX; mx++) { const int moduleIdx = my * numModulesX + mx; const int moduleLeft = paddingStart + mx * moduleStride; const int pxInFilterX = blockPixelIdxX - moduleLeft; const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f] : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f]; #pragma unroll for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { shFilterLoad[i * filterCacheF] = fLoad[i * filterPixels * numFilters]; } } //#pragma unroll for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { //conv_img_acts_manycolor_dummy_fhLoop(hidActs, shHidActLoad, shHidActs, shFilters, moduleIdx, numImages, hidActLoadY, hidActLoadX, blockCaseIdx, numModules, f, fh, prod); const float* hLoad = &hidActs[(moduleIdx + fh * numModules) * numImages]; #pragma unroll for (int j = 0; j < filterCacheH; j += B_Y) { if (filterCacheH % B_Y == 0 || hidActLoadY + j < filterCacheH) { #pragma unroll for (int i = 0; i < imgsPerThread*B_X; i += B_X) { if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) { shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; } else { shHidActLoad[j * B_X * imgsPerThread + i] = 0; } } } } __syncthreads(); // Do some actual computation // Using these variables causes register usage to go from 161 --> 123. // But nonetheless, the high-register version is faster. //const float* shF = &shFilters[threadIdx.y][fh-f]; //const float* const shF2 = &shFilters[threadIdx.y][fh]; //const float* shH = &shHidActs[0][threadIdx.x]; #pragma unroll for (int w = 0; w < filterCacheH; w++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { prod[c][i] += shFilters[c * B_Y + threadIdx.y][fh-f + w] * shHidActs[w][threadIdx.x + i * B_X]; } } } __syncthreads(); } } } } if (scale) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; } } } } else { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; } } } } } /* * New Titan-optimized stuff. 
 */
__device__ __forceinline__ void conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
        const int my, const int mx, const int numModulesX, const int paddingStart, const int moduleStride,
        const int blockPixelIdxY, const int blockPixelIdxX, const int filterSize, int &moduleIdx, int &pxIdxInFilter) {
    const int moduleTop = paddingStart + my * moduleStride;
    const int pxInFilterY = blockPixelIdxY - moduleTop;

    moduleIdx = my * numModulesX + mx; // out
    const int moduleLeft = paddingStart + mx * moduleStride;
    const int pxInFilterX = blockPixelIdxX - moduleLeft;

    pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; // out
}

#define IA_PRELOAD_LOOP(w,offset) _Pragma("unroll") \
for (int i = 0; i < imgsPerThread; i++) { \
    _Pragma("unroll") \
    for (int c = 0; c < colorsPerThread; c++) { \
        prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \
    } \
} \

/*
 * Same loop as above but inverted.
 */
#define IA_PRELOAD_LOOP2(w,offset) _Pragma("unroll") \
for (int c = 0; c < colorsPerThread; c++) { \
    _Pragma("unroll") \
    for (int i = 0; i < imgsPerThread; i++) { \
        prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \
    } \
} \

#define IA_PRELOAD_LOOP3(i,offset) _Pragma("unroll") \
for (int w = 0; w < filterCacheH; w++) { \
    _Pragma("unroll") \
    for (int c = 0; c < colorsPerThread; c++) { \
        prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \
    } \
} \

#define IA_PRELOAD_W(z) wPreload[z] = fLoad[(z) * B_X*B_Y/filterCacheF * filterPixels * numFilters];
#define IA_PRELOAD_W_TX(z) wPreload[z] = tex1Dfetch<float>(filters, filtersLoadOffset + (z) * B_X*B_Y/filterCacheF * filterPixels * numFilters);
#define IA_PRELOAD_H(y,x) if (!checkCaseBounds || myCaseIdx + (x) * B_X < numImages) { \
    hPreload[y][x] = hLoad[(y) * B_Y * numModules * numImages + (x) * B_X]; \
}
#define IA_PRELOAD_H_TX(y,x) if (!checkCaseBounds || myCaseIdx + (x) * B_X < numImages) { \
    hPreload[y][x] = tex1Dfetch<float>(hidActs, hidActsLoadOffset + (y) * B_Y * numModules * numImages + (x) * B_X); \
}

template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, int filterCacheF, int filterCacheH, bool scale, bool checkCaseBounds, bool conv>
__global__ void
__launch_bounds__(256, 2) // 256 threads per block, 2 blocks per multiprocessor
                          // These launch bounds ensure 25% occupancy (128 registers used)
                          // as opposed to 13% (130 registers) achieved by defaults.
conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets, const int numModulesY, const int numModulesX, const int numImages, const int numFilters, const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs) { __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; const int myCaseIdx = blockCaseIdx + threadIdx.x; const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally const int numFilterColors = numImgColors / numGroups; const int blockGroupIdx = imgColorIdx / numFilterColors; const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group const int numFiltersPerGroup = numFilters / numGroups; const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; const int blockPixelIdx = blockIdx.y; const int blockPixelIdxX = blockPixelIdx % imgSizeX; const int blockPixelIdxY = blockPixelIdx / imgSizeX; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int tidx = threadIdx.y * B_X + threadIdx.x; // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; // nvcc is behaving idiotically again, these useless declarations save registers //const int outputY = threadIdx.y, outputX = threadIdx.x; //const int ty = threadIdx.y, tx = threadIdx.x; const int numModules = numModulesY * numModulesX; const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; // filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; float prod[colorsPerThread][imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[c][i] = 0; } } const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); const int startX = blockPixelIdxX - paddingStart < filterSize ? 
                            0 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
    const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);

    float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
    float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread];
    //const bool noFLoop = filterCacheF == filterCacheH;

    /*
     * Initial preload
     */
    float hPreload[filterCacheH/B_Y][imgsPerThread]; // [2][4]
    float wPreload[filterCacheF*colorsPerThread/B_X]; // [8]

    int moduleIdx, pxIdxInFilter;
    conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter);
    // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0]
    //                           : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0];
    int filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + 0
                                                  : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters);
    #pragma unroll
    for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) {
        if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
            wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch<float>(filters, filtersLoadOffset + i * filterPixels * numFilters);
        }
    }

    // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages];
    int hidActsLoadOffset = hidActsOffset + (moduleIdx + 0 * numModules) * numImages;
    #pragma unroll
    for (int j = 0; j < filterCacheH; j += B_Y) {
        if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    hPreload[j/B_Y][i] = tex1Dfetch<float>(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X);
                }
            }
        }
    }

    for (int my = startY; my < endY; my++) {
        const int moduleTop = paddingStart + my * moduleStride;
        const int pxInFilterY = blockPixelIdxY - moduleTop;

        for (int mx = startX; mx < endX; mx++) {
            moduleIdx = my * numModulesX + mx;
            const int moduleLeft = paddingStart + mx * moduleStride;
            const int pxInFilterX = blockPixelIdxX - moduleLeft;

            pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;

            int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext;
            const bool lastModule = my == endY - 1 && mx == endX - 1;
            if (!lastModule) {
                mxNext = mx + 1 == endX ? startX : mx + 1;
                myNext = my + (mx + 1 == endX);
            }
            conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext);

            for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time
                #pragma unroll
                for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) {
                    if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
                        shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)];
                    }
                }

                filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF
                                                          : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF);
                if (f == numFiltersPerGroup - filterCacheF) {
                    filtersLoadOffset = filtersOffset + (conv ?
                                                         pxIdxInFilterNext * numFilters
                                                       : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters);
                }

                #pragma unroll
                for (int j = 0; j < filterCacheH; j += B_Y) {
                    if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
                        #pragma unroll
                        for (int i = 0; i < imgsPerThread; i++) {
                            // NOTE: bank conflicts here!
                            if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                                shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i];
                            }
                        }
                    }
                }

                __syncthreads();

                hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheH) * numModules) * numImages;

                #pragma unroll
                for (int z = 0; z < 4; ++z) {
                    IA_PRELOAD_LOOP(z,0);
                    IA_PRELOAD_W_TX(z);
                }

                #pragma unroll
                for (int z = 4; z < 12; ++z) {
                    IA_PRELOAD_LOOP(z,0);
                    IA_PRELOAD_H_TX((z-4)/4,z%4);
                }

                #pragma unroll
                for (int z = 12; z < 16; ++z) {
                    IA_PRELOAD_LOOP(z,0);
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < filterCacheH; j += B_Y) {
                    if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
                        #pragma unroll
                        for (int i = 0; i < imgsPerThread; i++) {
                            if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                                shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i];
                            }
                        }
                    }
                }

                __syncthreads();

                hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages;
                if (f == numFiltersPerGroup - filterCacheF) {
                    hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages;
                }

                #pragma unroll
                for (int z = 0; z < 4; ++z) {
                    IA_PRELOAD_LOOP(z,filterCacheH);
                    IA_PRELOAD_W_TX(z+4);
                }

                #pragma unroll
                for (int z = 4; z < 12; ++z) {
                    IA_PRELOAD_LOOP(z,filterCacheH);
                    IA_PRELOAD_H_TX((z-4)/4, z%4);
                }

                #pragma unroll
                for (int z = 12; z < 16; ++z) {
                    IA_PRELOAD_LOOP(z,filterCacheH);
                }

                __syncthreads();
            }
        }
    }
    if (scale) {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i];
                }
            }
        }
    } else {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i];
                }
            }
        }
    }
}

template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, int filterCacheF, int filterCacheH, bool scale, bool checkCaseBounds, bool conv>
__global__ void
//__launch_bounds__(128, 3) // 128 threads per block, 3 blocks per multiprocessor
conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets,
        const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
        const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
        const int numImgColors, const int numGroups,
        const float scaleTargets, const float scaleOutputs) {
    __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF];
    __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread];

    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;
    const int myCaseIdx = blockCaseIdx + threadIdx.x;

    const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally
    const int numFilterColors = numImgColors / numGroups;
    const int blockGroupIdx = imgColorIdx / numFilterColors;
    const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
    const int numFiltersPerGroup = numFilters / numGroups;
    const int
blockFilterIdx = blockGroupIdx * numFiltersPerGroup; const int blockPixelIdx = blockIdx.y; const int blockPixelIdxX = blockPixelIdx % imgSizeX; const int blockPixelIdxY = blockPixelIdx / imgSizeX; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int tidx = threadIdx.y * B_X + threadIdx.x; // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; // nvcc is behaving idiotically again, these useless declarations save registers //const int outputY = threadIdx.y, outputX = threadIdx.x; //const int ty = threadIdx.y, tx = threadIdx.x; const int numModules = numModulesY * numModulesX; const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; // filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; float prod[colorsPerThread][imgsPerThread]; #pragma unroll for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[c][i] = 0; } } const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; //const bool noFLoop = filterCacheF == filterCacheH; /* * Initial preload */ float hPreload[filterCacheH/B_Y][imgsPerThread]; // [4][4] float wPreload[filterCacheF*colorsPerThread/B_X]; // [6] int moduleIdx, pxIdxInFilter; conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter); // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] // : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0]; int filtersLoadOffset = filtersOffset + (conv ? 
                                                    pxIdxInFilter * numFilters
                                                  : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters);
    #pragma unroll
    for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) {
        if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
            wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch<float>(filters, filtersLoadOffset + i * filterPixels * numFilters);
        }
    }

    // const float* hLoad = &hidActs[moduleIdx * numImages];
    int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages;
    #pragma unroll
    for (int j = 0; j < filterCacheH; j += B_Y) {
        if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    hPreload[j/B_Y][i] = tex1Dfetch<float>(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X);
                }
            }
        }
    }

    for (int my = startY; my < endY; my++) {
        const int moduleTop = paddingStart + my * moduleStride;
        const int pxInFilterY = blockPixelIdxY - moduleTop;

        for (int mx = startX; mx < endX; mx++) {
            moduleIdx = my * numModulesX + mx;
            const int moduleLeft = paddingStart + mx * moduleStride;
            const int pxInFilterX = blockPixelIdxX - moduleLeft;

            pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;

            int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext;
            const bool lastModule = my == endY - 1 && mx == endX - 1;
            if (!lastModule) {
                mxNext = mx + 1 == endX ? startX : mx + 1;
                myNext = my + (mx + 1 == endX);
            }
            conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext);

            for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time
                #pragma unroll
                for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) {
                    if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
                        shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)];
                    }
                }

                filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF
                                                          : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF);
                if (f == numFiltersPerGroup - filterCacheF) {
                    filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilterNext * numFilters
                                                              : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters);
                }

                #pragma unroll
                for (int j = 0; j < filterCacheH; j += B_Y) {
                    if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
                        #pragma unroll
                        for (int i = 0; i < imgsPerThread; i++) {
                            // NOTE: bank conflicts here!
                            if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                                shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i];
                            }
                        }
                    }
                }

                hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages;
                if (f == numFiltersPerGroup - filterCacheF) {
                    hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages;
                }

                __syncthreads();

                // It seems that there is no point explicitly interleaving loads
                // and computations because the scheduler does that anyway.
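                // A reading of the unrolled block below (with this kernel's intended
                // parameters filterCacheF = filterCacheH = 16, B_Y = 4, B_X = 32,
                // imgsPerThread = 4, colorsPerThread = 12): the 16 IA_PRELOAD_LOOP2
                // steps consume the filters/hidActs currently staged in shared memory,
                // while the 6 IA_PRELOAD_W_TX calls refill wPreload[6] and the 16
                // IA_PRELOAD_H_TX calls refill hPreload[4][4] for the next f iteration,
                // all issued back to back.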
                IA_PRELOAD_LOOP2(0,0);
                IA_PRELOAD_LOOP2(1,0);
                IA_PRELOAD_LOOP2(2,0);
                IA_PRELOAD_LOOP2(3,0);
                IA_PRELOAD_LOOP2(4,0);
                IA_PRELOAD_LOOP2(5,0);
                IA_PRELOAD_LOOP2(6,0);
                IA_PRELOAD_LOOP2(7,0);
                IA_PRELOAD_LOOP2(8,0);
                IA_PRELOAD_LOOP2(9,0);
                IA_PRELOAD_LOOP2(10,0);
                IA_PRELOAD_LOOP2(11,0);
                IA_PRELOAD_LOOP2(12,0);
                IA_PRELOAD_LOOP2(13,0);
                IA_PRELOAD_LOOP2(14,0);
                IA_PRELOAD_LOOP2(15,0);

                IA_PRELOAD_W_TX(0);
                IA_PRELOAD_W_TX(1);
                IA_PRELOAD_W_TX(2);
                IA_PRELOAD_W_TX(3);
                IA_PRELOAD_W_TX(4);
                IA_PRELOAD_W_TX(5);

                IA_PRELOAD_H_TX(0,0);
                IA_PRELOAD_H_TX(0,1);
                IA_PRELOAD_H_TX(0,2);
                IA_PRELOAD_H_TX(0,3);
                IA_PRELOAD_H_TX(1,0);
                IA_PRELOAD_H_TX(1,1);
                IA_PRELOAD_H_TX(1,2);
                IA_PRELOAD_H_TX(1,3);
                IA_PRELOAD_H_TX(2,0);
                IA_PRELOAD_H_TX(2,1);
                IA_PRELOAD_H_TX(2,2);
                IA_PRELOAD_H_TX(2,3);
                IA_PRELOAD_H_TX(3,0);
                IA_PRELOAD_H_TX(3,1);
                IA_PRELOAD_H_TX(3,2);
                IA_PRELOAD_H_TX(3,3);

                __syncthreads();
            }
        }
    }
    if (scale) {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i];
                }
            }
        }
    } else {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int i = 0; i < imgsPerThread; i++) {
                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
                    targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i];
                }
            }
        }
    }
}
/*
 * hidActs: (numFilters, numModules, numImages)
 * filters: (numFilterColors, filterPixels, numFilters)               if conv
 *          (numModules, numFilterColors, filterPixels, numFilters)   otherwise
 * targets: (overSample, numImgColors, imgPixels, numImages)
 *
 * Note: all of these convolution routines are optimized for the case when
 * the number of images (i.e. the minibatch size) is a multiple of 128.
 * Other batch sizes will work, but I made no attempt whatsoever
 * to make them work fast.
 */
void _imgActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets,
              int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
              float scaleTargets, float scaleOutput, bool conv) {
    CAFFE_ENFORCE(hidActs->ndim() == 2);
    CAFFE_ENFORCE(filters->ndim() == 2);
    CAFFE_ENFORCE(targets->ndim() == 2);
    int numFilterColors = numImgColors / numGroups;
    int numImages = hidActs->dim32(1);
    int numFilters = filters->dim32(1);
    int numModules = hidActs->dim32(0) / numFilters;
    int filterModuleMult = conv ? 1 : numModules;
    int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors);
    int filterSize = sqrt(filterPixels);
    int imgPixels = imgSizeY * imgSizeX;
    int numModulesX = numModules / numModulesY;

    CAFFE_ENFORCE(numImgColors % numGroups == 0);
    CAFFE_ENFORCE(numFilters % (16*numGroups) == 0); // TODO: insisting on 32 filters due to bug in calling code below. fix that.
    CAFFE_ENFORCE(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
    CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0);

    CAFFE_ENFORCE(filterPixels == filterSize * filterSize);
    CAFFE_ENFORCE(hidActs->dim32(0) == numModules * numFilters);
    CAFFE_ENFORCE(filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels);
    CAFFE_ENFORCE(numModules == numModulesY * numModulesX);

    // These routines don't handle the case when only part of the image is visited in the convolution
    CAFFE_ENFORCE(paddingStart <= 0);
    CAFFE_ENFORCE(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
    CAFFE_ENFORCE(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
    CAFFE_ENFORCE(moduleStride <= filterSize);

    dim3 blocks;
    dim3 threads;
    int colorsPerThread, imgsPerThread;
    if (numFilterColors % 8 == 0) {
        threads = dim3(32, numFilterColors % 64 == 0 ? 8 : 4);
        colorsPerThread = numFilterColors % 64 == 0 ? 8
                        : numFilterColors % 48 == 0 ? 12
                        : numFilterColors % 32 == 0 ? 8
                        : numFilterColors % 16 == 0 ? 4 : 2;
        imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
        CAFFE_ENFORCE(numFilterColors % (threads.y * colorsPerThread) == 0);
        blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), imgPixels);
        // NOTE: the case when channels % 32 == 0 but channels % 48 != 0 and channels % 64 != 0 has not been optimized!!
    } else if (numFilterColors > 3) {
        // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!!
        imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
        threads = dim3(16, 16);
        colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
        blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread) * (numImgColors / colorsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
    } else {
        // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!!
        imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
        threads = dim3(16, 16);
        blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
    }
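    // Worked example of the geometry above (assumed sizes, with numGroups = 1):
    // numFilterColors = numImgColors = 64, numImages = 128, imgSizeY = imgSizeX = 32
    // selects threads = dim3(32, 8), colorsPerThread = 8, imgsPerThread = 4, so
    // blocks = dim3(DIVUP(128, 128) * (64/64), 1024) = dim3(1, 1024): one block per
    // target pixel, each reconstructing all 64 colors for all 128 images.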
4 : 2; threads = dim3(16, 16); blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4)); } bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; if (scaleTargets == 0) { // do not scale or use targets matrix targets->Resize(std::vector{numImgColors*imgPixels, numImages}); } else { CAFFE_ENFORCE(targets->dim32(0) == numImgColors * imgPixels); CAFFE_ENFORCE(targets->dim32(1) == numImages); } const bool scale = scaleTargets != 0; cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs); cudaTextureObject_t tex_filters = GetTensorTextureObject(filters); float* hidacts_data = hidActs->mutable_data(); float* filters_data = filters->mutable_data(); float* targets_data = targets->mutable_data(); cudaStream_t stream = context->cuda_stream(); // cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); // conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true ><<>>( // tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, // imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); //return; // printf("conv: %d\n", conv); // printf("scale: %d\n", scale); // printf("checkCaseBounds: %d\n", checkCaseBounds); // printf("numFilterColors: %d\n", numFilterColors); // printf("numImages: %d\n", numImages); // cudaStream_t stream = NVMatrix::getDefaultStream(); if (conv == true) { if (scale == false) { if (checkCaseBounds == false) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, true ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, 
scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, 
targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, true >, cudaFuncCachePreferShared); 
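// A reading of the template arguments used throughout this dispatch,
// assuming the cuda-convnet2 parameter order <B_Y, B_X, imgsPerThread,
// colorsPerThread, filterCacheF, filterCacheH, scale, checkCaseBounds, conv>
// (a sketch based on the kernel definitions above, not a spec). For the
// instantiation launched just below,
//   conv_img_acts_manycolor_kepler< 4, 32, 1, 8, 16, 16, false, false, true >
// a 32x4 thread block covers B_X * imgsPerThread = 32 images and
// B_Y * colorsPerThread = 32 input colors, with each thread accumulating
// imgsPerThread * colorsPerThread = 8 gradient values. The host code above
// picks imgsPerThread (1, 2, or 4) from whether numImages is a multiple of
// 32, 64, or 128, and filterCacheF (16 or 32) from the divisibility of
// numFilters, which is why every branch tests exactly those conditions.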
conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) 
{ cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 8, 4, false, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 8, 4, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 4, 4, false, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 4, 4, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, 
scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 3, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 3, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 3, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 3, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 1, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 1, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 1, false, false, true >, 
cudaFuncCachePreferShared); img_acts_color < 4, 1, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } else if (checkCaseBounds == true) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if 
(numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, true, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } } else if (scale == true) { if (checkCaseBounds == false) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 
4, 8, 32, 16, true, false, true ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 
32, 4, 12, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, true, false, true ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else 
if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, 
paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 8, 4, true, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 8, 4, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 4, 4, true, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 4, 4, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, false, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, 
scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 3, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 3, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 3, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 3, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, true, false, true >, 
cudaFuncCachePreferShared); img_acts_color < 4, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 1, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 8, 1, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 1, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 4, 1, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, false, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, false, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } else if (checkCaseBounds == true) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) 
{ if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, true, true >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, true, true >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, 
true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, true, true >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, true, true ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } } } else if (conv == false) { if (scale == false) { if (checkCaseBounds == false) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, false ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 
0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, false ><<>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false ><<>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, false ><<>>(hidacts_data, 
filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, false >,
cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if
(numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 8, 4, false, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 8, 4, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 4, 4, false, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 4, 4, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 3, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 3, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride,
scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 3, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 3, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 1, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 1, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 1, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 1, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, false, false
><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } else if (checkCaseBounds == true) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors,
numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, true, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, false, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } } else if (scale == true) { if (checkCaseBounds == false) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared);
conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(tex_hidacts, tex_filters, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else
if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize,
imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16,
true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 8, 4, true, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 8, 4, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 4, 4, true, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 4, 4, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, false, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages,
numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 3, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 3, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 3, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 3, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true,
false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 128 == 0) { cudaFuncSetCacheConfig(img_acts_color < 8, 1, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 8, 1, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { cudaFuncSetCacheConfig(img_acts_color < 4, 1, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 4, 1, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, false, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, false, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } else if (checkCaseBounds == true) { if (numFilterColors % 8 == 0) { if (numFilterColors % 64 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 48 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages,
numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 32 == 0) { if (numFilters % 32 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } else if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 16 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors % 8 == 0) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, true, false >, cudaFuncCachePreferShared); conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } } else if (numFilterColors > 3) { if (numFilterColors == 4) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, true, true, false >, cudaFuncCachePreferShared); img_acts_mediumcolor < 2, 4, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 3, true, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 3, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 2) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) {
cudaFuncSetCacheConfig(img_acts_color < 2, 2, true, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 2, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } else if (numFilterColors == 1) { if (numFilters % 16 == 0) { if (numImages % 1 == 0) { cudaFuncSetCacheConfig(img_acts_color < 2, 1, true, true, false >, cudaFuncCachePreferShared); img_acts_color < 2, 1, true, true, false ><<<blocks, threads, 0, stream>>>(hidacts_data, filters_data, targets_data, numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); } } } } } } } checkCudaErrors(cudaDestroyTextureObject(tex_hidacts)); checkCudaErrors(cudaDestroyTextureObject(tex_filters)); getLastCudaError("imgActs: kernel execution failed"); } void convImgActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) { _imgActs(context, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, true); } void convImgActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, float scaleTargets, float scaleOutput) { _imgActs(context, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true); } void localImgActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) { _imgActs(context, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, false); } void localImgActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* filters, caffe2::TensorCUDA* targets, int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, float scaleTargets, float scaleOutput) { _imgActs(context, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false); }
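// A minimal usage sketch (editorial illustration, not part of the original file): it shows how
// the convImgActs overloads above might be called to compute the gradient with respect to the
// convolution input. Only the convImgActs signatures come from this file; the function name,
// tensor roles, and every geometry constant below are assumptions chosen for illustration.
static void example_backward_data(caffe2::CUDAContext* context,
                                  caffe2::TensorCUDA* hidActs,   // gradients w.r.t. conv outputs
                                  caffe2::TensorCUDA* filters,   // learned filters
                                  caffe2::TensorCUDA* imgGrads)  // output: gradients w.r.t. images
{
    const int imgSizeY = 32, imgSizeX = 32;   // assumed input geometry
    const int numModulesY = 32;               // assumed number of output modules along Y
    const int paddingStart = -2;              // assumed padding (cuda-convnet convention: <= 0)
    const int moduleStride = 1;
    const int numImgColors = 64, numGroups = 1;
    // Overwrite imgGrads: the short overload passes scaleTargets = 0, scaleOutput = 1.
    convImgActs(context, hidActs, filters, imgGrads, imgSizeY, imgSizeX,
                numModulesY, paddingStart, moduleStride, numImgColors, numGroups);
    // Accumulate instead: imgGrads = 1 * imgGrads + 1 * result, via the two-scale overload.
    convImgActs(context, hidActs, filters, imgGrads, imgSizeY, imgSizeX,
                numModulesY, paddingStart, moduleStride, numImgColors, numGroups,
                1.0f, 1.0f);
}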
================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <assert.h> #include "../include/cudaconv2.cuh" #define LO16(x) ((x) & 0x0000FFFF) #define HI16(x) ((x) >> 16) #define WA_LOOP(r) _Pragma("unroll") \ for (int c = 0; c < colorsPerThread; c++) { \ _Pragma("unroll") \ for (int f = 0; f < filtersPerThread; f++) { \ prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * shHidActs[threadIdx.x + f * B_X][(r)]; \ } \ } #define WA_LOOP2(r) _Pragma("unroll") \ for (int f = 0; f < filtersPerThread; f++) { \ _Pragma("unroll") \ for (int c = 0; c < colorsPerThread; c++) { \ prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * shHidActs[threadIdx.x + f * B_X][(r)]; \ } \ } #define WA_IMLOAD(r) imPreload[r] = im[(r) * B_X * B_Y / preloadCases * imgPixels * imgStride]; #define WA_IMLOAD_TX(r) imPreload[r] = tex1Dfetch<float>(images, imgOffset2 + (r) * B_X * B_Y / preloadCases * imgPixels * imgStride); #define WA_HALOAD(r) haPreload[r] = ha[(r) * B_X * B_Y / preloadCases * numImages * numModules]; #define WA_HALOAD_TX(r) haPreload[r] = tex1Dfetch<float>(hidActs, hidActsOffset2 + (r) * B_X * B_Y / preloadCases * numImages * numModules); __device__ __forceinline__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( const int my, const int mx, const int paddingStart, const int numModulesX, const int moduleStride, const int blockPixelY, const int blockPixelX, const int imgSizeX, const int imgStride, int& pixIdx, int& m) { const int imgLoadModPosY = paddingStart + my * moduleStride; const int imgLoadModPosX = paddingStart + mx * moduleStride; const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image const int pxX = imgLoadModPosX + blockPixelX; pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image m = my * numModulesX + mx; }
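// Editorial sketch of the launch geometry that the doc-comment below (and the index arithmetic
// inside conv_weight_acts_c_kepler) implies. The helper names and the ceiling division are
// assumptions added for illustration; the relations between grid dimensions and the template
// parameters are taken directly from the kernel's own use of blockIdx.x and blockIdx.y.
static inline int exampleCeilDiv(int a, int b) { return (a + b - 1) / b; }
static dim3 example_weight_acts_c_grid(int numFilters, int filterPixels, int numModules,
                                       int B_X, int B_Y, int filtersPerThread,
                                       int pixelsPerThread, int partialSum) {
    // blockIdx.x enumerates (filter batches of B_X * filtersPerThread) x (module batches of partialSum):
    const int filterBlocksPerModule = numFilters / (B_X * filtersPerThread);
    const int moduleBatches = numModules / partialSum;
    // blockIdx.y enumerates pixel batches of B_Y * pixelsPerThread filter pixels:
    const int pixelBatches = exampleCeilDiv(filterPixels, B_Y * pixelsPerThread);
    // The matching thread block would be dim3(B_X, B_Y): threadIdx.x picks the filter,
    // threadIdx.y the pixel within the filter, as the comment below states.
    return dim3(filterBlocksPerModule * moduleBatches, pixelBatches);
}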
/* * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters * threadIdx.x determines filter * threadIdx.y determines pixel in filter * * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum * blockIdx.y determines pixel batch of B_Y * pixelsPerThread * * Number of filters must be divisible by B_X * filtersPerThread * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. * * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) * * B_Y * B_X should be divisible by preloadCases. * preloadCases one of 16, 32. * B_X one of 4, 8, 16, 32 * B_Y arbitrary (satisfying divisibility constraints) * numModules must be divisible by partialSum * pixelsPerThread must be divisible by pixelCache * * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... * so the compiler is messing up here somehow. It's unable to optimize that case away. */ template <int B_Y, int B_X, int pixelsPerThread, int pixelCache, int preloadCases, int filtersPerThread, int numColors, bool scale, bool checkCaseBounds> __global__ void conv_weight_acts_c_kepler(float* images, float* hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int partialSum, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int filterBlocksPerModule = numFilters / (B_X*filtersPerThread); const int outputModuleIdx = blockIdx.x / filterBlocksPerModule; const int moduleIdx = partialSum * outputModuleIdx; const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % filterBlocksPerModule); // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; const int numModules = numModulesY * numModulesX; const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; images += loadX; hidActs += blockFilterIdx * numImages * numModules + loadY * numImages * numModules + loadX; targets += (outputModuleIdx * numFilters) * filterPixels * numColors + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.y * numFilters + threadIdx.x; float prod[numColors][pixelsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][p][f] = 0; } } } __shared__ int pxIdxes[B_Y*pixelsPerThread]; //__shared__ bool isPxInImage[B_Y*pixelsPerThread]; for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { __syncthreads(); if (tidx < B_Y * pixelsPerThread) { const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride; int pxY = (imgLoadModPosY + (blockPixelOffset + tidx) / filterSize); int pxX = (imgLoadModPosX + (blockPixelOffset + tidx) % filterSize); int pixIdx = (pxY * imgSizeX + pxX) * imgStride; pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; //isPxInImage[tidx] = ; } __syncthreads(); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { if (/*loadY < B_X*filtersPerThread &&*/ (!checkCaseBounds || caseIdx + loadX < numImages)) { #pragma unroll for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X*filtersPerThread) { shHidActs[loadY+y][loadX]= hidActs[caseIdx + y * numImages * numModules + m * numImages]; } } } #pragma unroll for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { //if (loadY < B_Y * pixelCache) { // This condition is not necessary for correctness, but it speeds things a bit /* * As long as B_Y * B_X is divisible by preloadCases this will loop the right * number of times. * * This will load some imgGrads from filter pixels that don't exist (it'll set those to 0), * but the code does not produce any output for those pixels (see last lines).
*/ #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; if (pixIdx >= 0) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = images[caseIdx + c * imgPixels * imgStride + pixIdx]; } } else { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; } } } else { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX]= 0; } } } } //} __syncthreads(); #pragma unroll for (int i = 0; i < preloadCases; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int p = 0; p < pixelCache; p++) { #pragma unroll for (int c = 0; c < numColors; c++) { prod[c][pp + p][f] += shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; } } } } __syncthreads(); } } } if (scale) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; } } } } } else { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; } } } } } }
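// Editorial note in sketch form: conv_weight_acts_c_kepler above writes its result as
// (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters), i.e. one partial
// weight-gradient slice per batch of partialSum modules (see the targets shape in the kernel
// doc-comments). Summing those slices into the final gradient is assumed to happen in the
// caller; a minimal CPU-side illustration of that reduction, with names and the flat layout
// being assumptions for illustration only:
static void example_sum_partial_weight_grads(const float* partial, float* out,
                                             int numChunks,    // numModules / partialSum
                                             int sliceSize) {  // numColors * filterPixels * numFilters
    for (int i = 0; i < sliceSize; ++i) {
        float acc = 0.0f;
        for (int c = 0; c < numChunks; ++c) {
            acc += partial[c * sliceSize + i];  // accumulate the same weight across all chunks
        }
        out[i] = acc;
    }
}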
* images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) * B_X * B_Y must be divisible by preloadCases */ template __global__ void conv_weight_acts_mc_mf_kepler(float* images, float* hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int numGroups, const int partialSum, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X * filtersPerThread); const int outputModuleIdx = blockIdx.x / numFilterBlocks; const int moduleIdx = partialSum * outputModuleIdx; const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); const int numModules = numModulesY * numModulesX; const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numFilterColors = numImgColors / numGroups; const int blockPixelOffset = blockIdx.z; // pixel idx in filter const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; hidActs += blockFilterIdx * numImages * numModules + loadY * numImages * numModules + loadX; targets += outputModuleIdx * numFilters * filterPixels * numFilterColors + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; //if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; float* shHidActLoad = &shHidActs[loadY][loadX]; float* shImgLoad = &shImages[loadY][loadX]; float prod[colorsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][f] = 0; } } for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride; const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image const int pxX = imgLoadModPosX + blockPixelX; const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) { for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { // Checking this condition actually makes things faster ... :/ // So I've removed the !checkCaseBounds flag and just check it all the time. if (caseIdx + loadX < numImages) { /* * As long as B_Y * B_X is divisible by preloadCases this will loop the right * number of times. 
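* (Both operands of the % guard below are compile-time template constants, so when the row count divides evenly the compiler folds the bounds check away and the unrolled loop body is branch-free.)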
* * This will load some images from filter pixels that don't exist (it'll set those to 0), * but the code does not produce any output for those pixels (see last lines). */ if (loadY < B_Y * colorsPerThread) { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { shImgLoad[(y) * preloadCases] = images[caseIdx + y * imgPixels * imgStride + pixIdx]; } } } if (loadY < B_X * filtersPerThread) { #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules + m * numImages]; } } } } else { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { shImgLoad[(y) * preloadCases] = 0; } } #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { shHidActLoad[y * (preloadCases + 1)] = 0; } } } __syncthreads(); #pragma unroll for (int i = 0; i < preloadCases; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; } } } __syncthreads(); } } } if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f]; } } } else { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f]; } } } } /* * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread colors and B_X * filtersPerThread filters * threadIdx.x determines filter * threadIdx.y determines color * * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum * blockIdx.y determines color batch of B_Y * colorsPerThread * blockIdx.z determines pixel in filter * NOTE: blockIdx.z is limited to values < 2^16. This means that this routine will * fail for filters >= 256*256. I'm assuming I won't ever use such large filters. 
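* (In this sw variant each block sums over a sumWidth x sumWidth chunk of modules, so the leading dimension of targets is the number of chunks, DIVUP(numModulesY, sumWidth) * DIVUP(numModulesX, sumWidth), which plays the role of the numModulesY*numModulesX/partialSum figure above.)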
* images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) * B_X * B_Y must be divisible by preloadCases */ template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale> __global__ void conv_weight_acts_mc_mf_kepler_sw(float* images, float* hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int numGroups, const int sumWidth, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X * filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); const int numModules = numModulesY * numModulesX; const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numFilterColors = numImgColors / numGroups; const int blockPixelOffset = blockIdx.z; // pixel idx in filter const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; hidActs += blockFilterIdx * numImages * numModules + loadY * numImages * numModules + loadX; targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; //if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); // if (mStartY == mEndY || mStartX == mEndX) { // return; // } float* shHidActLoad = &shHidActs[loadY][loadX]; float* shImgLoad = &shImages[loadY][loadX]; float prod[colorsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][f] = 0; } } /* * Note: iterating this way is about 1% slower and uses a
few more registers than iterating * over the modules linearly. But it's consistent with the preload routines, * so I'm using it. */ for (int my = mStartY; my < mEndY; my++) { const int imgLoadModPosY = paddingStart + my * moduleStride; const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image for (int mx = mStartX; mx < mEndX; mx++) { const int m = my * numModulesX + mx; const int imgLoadModPosX = paddingStart + mx * moduleStride; const int pxX = imgLoadModPosX + blockPixelX; const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { // Checking this condition actually makes things faster ... :/ // So I've removed the !checkCaseBounds flag and just check it all the time. if (caseIdx + loadX < numImages) { /* * As long as B_Y * B_X is divisible by preloadCases this will loop the right * number of times. * * This will load some images from filter pixels that don't exist (it'll set those to 0), * but the code does not produce any output for those pixels (see last lines). */ if (loadY < B_Y * colorsPerThread) { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { shImgLoad[(y) * preloadCases] = images[caseIdx + y * imgPixels * imgStride + pixIdx]; } } } if (loadY < B_X * filtersPerThread) { #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules + m * numImages]; } } } } else { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { shImgLoad[(y) * preloadCases] = 0; } } #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { shHidActLoad[y * (preloadCases + 1)] = 0; } } } __syncthreads(); #pragma unroll for (int i = 0; i < preloadCases; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; } } } __syncthreads(); } } } if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f]; } } } else { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f]; } } } } /* * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters * threadIdx.x determines 
filter * threadIdx.y determines pixel in filter * * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum * blockIdx.y determines pixel batch of B_Y * pixelsPerThread * * Number of filters must be divisible by B_X * filtersPerThread * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. * * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) * * B_Y * B_X should be divisible by preloadCases. * preloadCases one of 16, 32. * B_X one of 4, 8, 16, 32 * B_Y arbitrary (satisfying divisibility constraints) * numModules must be divisible by partialSum * pixelsPerThread must be divisible by pixelCache * * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... * so the compiler is messing up here somehow. It's unable to optimize that case away. */ template __global__ void conv_weight_acts_c_kepler_sw(float* images, float* hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int sumWidth, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X*filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; const int numModules = numModulesY * numModulesX; const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; images += loadX; hidActs += blockFilterIdx * numImages * numModules // + loadY * numImages * numModules + loadX; targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.y * numFilters + threadIdx.x; //float* shImgLoad = &shImages[loadY][loadX]; //float* shHidActLoad = &shHidActs[loadY][loadX]; float prod[numColors][pixelsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][p][f] = 0; } } } const int mStartX = blockModuleStartX; const int mStartY = blockModuleStartY; const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); // if (mStartY == mEndY || mStartX == mEndX) { // return; // 
} const int fYOff = (blockPixelOffset + tidx) / filterSize; const int fXOff = (blockPixelOffset + tidx) % filterSize; __shared__ int pxIdxes[B_Y*pixelsPerThread]; for (int my = mStartY; my < mEndY; my++) { const int imgLoadModPosY = paddingStart + my * moduleStride; for (int mx = mStartX; mx < mEndX; mx++) { const int m = my * numModulesX + mx; __syncthreads(); const int imgLoadModPosX = paddingStart + mx * moduleStride; if (tidx < B_Y * pixelsPerThread) { // const int imgLoadModPosY = paddingStart + my * moduleStride; // const int imgLoadModPosX = paddingStart + mx * moduleStride; int pxY = (imgLoadModPosY + fYOff); int pxX = (imgLoadModPosX + fXOff); int pixIdx = (pxY * imgSizeX + pxX) * imgStride; pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; } __syncthreads(); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { if (/*loadY < B_X*filtersPerThread &&*/ (!checkCaseBounds || caseIdx + loadX < numImages)) { #pragma unroll for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { const int fIdx = ((loadY + y) % filtersPerThread) * B_X + (loadY + y) / filtersPerThread; // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || loadY+y < B_X*filtersPerThread) { shHidActs[loadY+y][loadX]= hidActs[caseIdx + fIdx * numImages * numModules + m * numImages]; } } } else { #pragma unroll for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { // const int fIdx = ((loadY + y) % filtersPerThread) * B_X + (loadY + y) / filtersPerThread; // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || loadY+y < B_X*filtersPerThread) { shHidActs[loadY+y][loadX] = 0; } } } #pragma unroll for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { //if (loadY < B_Y * pixelCache) { // This condition is not necessary for correctness, but it speeds things a bit /* * As long as B_Y * B_X is divisible by preloadCases this will loop the right * number of times. * * This will load some imgGrads from filter pixels that don't exit (it'll set those to 0), * but the code does not produce any output for those pixels (see last lines). 
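* (Note the fIdx permutation above: hidAct row r is staged into shared memory from global row ((r % filtersPerThread) * B_X + r / filtersPerThread), so the accumulation below can read shHidActs[threadIdx.x * filtersPerThread + f] with f varying fastest; the WA_3_FIDX macro defined further down encodes the same permutation for the preload kernels.)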
*/ #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; if (pixIdx >= 0) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = images[caseIdx + c * imgPixels * imgStride + pixIdx]; } } else { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; } } } else { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX]= 0; } } } } //} __syncthreads(); #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int i = 0; i < preloadCases; i++) { #pragma unroll for (int p = 0; p < pixelCache; p++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][pp + p][f] += shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; } } } } __syncthreads(); } } } } if (scale) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; } } } } } else { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; } } } } } } #define WA_C3_LOOP(pp, c) _Pragma("unroll") \ for (int i = 0; i < preloadCases; i++) { \ _Pragma("unroll") \ for (int p = 0; p < pixelCache; p++) { \ _Pragma("unroll") \ for (int f = 0; f < filtersPerThread; f++) { \ prod[c][(pp) + p][f] += shImages[threadIdx.y + p * B_Y + (c) * pixelCache * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; \ } \ } \ } #define WA_C3_LOOP2(pp) _Pragma("unroll") \ for (int p = 0; p < pixelCache; p++) { \ _Pragma("unroll") \ for (int i = 0; i < preloadCases; i++) { \ _Pragma("unroll") \ for (int f = 0; f < filtersPerThread; f++) { \ _Pragma("unroll") \ for (int c = 0; c < 3; ++c) { \ prod[c][(pp) + p][f] += shImages[threadIdx.y + p * B_Y + (c) * pixelCache * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; \ } \ } \ } \ } #define WA_3_FIDX(y) (((loadY + (y)*B_X*B_Y/preloadCases) % filtersPerThread) * B_X + (loadY + (y)*B_X*B_Y/preloadCases) / filtersPerThread) /* * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters * threadIdx.x determines filter * threadIdx.y determines pixel in filter * * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum * blockIdx.y determines pixel batch of B_Y * pixelsPerThread * * Number of filters must be divisible by B_X * filtersPerThread * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. 
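* In the f_4 / r_32 instantiation suggested by the kernel name, each thread preloads filtersPerThread * preloadCases / B_Y = 4 * 32 / 16 = 8 hidAct values per step, matching the [8] annotation below; the B_Y = 16 figure is an assumption implied by the commented-out __launch_bounds__(256,2) and a 16x16 thread block.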
* * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) * * B_Y * B_X should be divisible by preloadCases. * preloadCases one of 16, 32. * B_X one of 4, 8, 16, 32 * B_Y arbitrary (satisfying divisibility constraints) * numModules must be divisible by partialSum * pixelsPerThread must be divisible by pixelCache * * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... * so the compiler is messing up here somehow. It's unable to optimize that case away. */ template //__launch_bounds__(256,2) __global__ void conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int sumWidth, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X*filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; const int numModules = numModulesY * numModulesX; const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; const int imgOffset = loadX; const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; // images += loadX; // hidActs += blockFilterIdx * numImages * numModules // + loadX; targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.y * numFilters + threadIdx.x; //float* shImgLoad = &shImages[loadY][loadX]; //float* shHidActLoad = &shHidActs[loadY][loadX]; float prod[numColors][pixelsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][p][f] = 0; } } } const int mStartX = blockModuleStartX; const int mStartY = blockModuleStartY; const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); const bool doWork = mStartY < mEndY && mStartX < mEndX; // if (!doWork) { // hidActs -= // } // if (mStartY == mEndY || mStartX == mEndX) { // return; // } // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] float haPreload[filtersPerThread * 
preloadCases / B_Y]; // [8] // if (blockIdx.x != 0 || blockIdx.y !=0) { // return; // } // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, mStartY, mEndX, mEndY); const int fYOff = (blockPixelOffset + tidx) / filterSize; const int fXOff = (blockPixelOffset + tidx) % filterSize; __shared__ int pxIdxes[B_Y*pixelsPerThread]; // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [8] int m = mStartY * numModulesX + mStartX; int fidx[filtersPerThread * preloadCases / B_Y]; if (doWork) { #pragma unroll for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { const int fIdx = WA_3_FIDX(y); // if (doWork) { haPreload[y] = tex1Dfetch(hidActs, hidActsOffset + fIdx * numImages * numModules + m * numImages); // } fidx[y] = fIdx * numImages * numModules; } } for (int my = mStartY; my < mEndY; my++) { const int imgLoadModPosY = paddingStart + my * moduleStride; for (int mx = mStartX; mx < mEndX; mx++) { m = my * numModulesX + mx; // __syncthreads(); const int imgLoadModPosX = paddingStart + mx * moduleStride; if (tidx < B_Y * pixelsPerThread) { // const int imgLoadModPosY = paddingStart + my * moduleStride; // const int imgLoadModPosX = paddingStart + mx * moduleStride; const int pxY = (imgLoadModPosY + fYOff); const int pxX = (imgLoadModPosX + fXOff); const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; } __syncthreads(); int myNext = my, mxNext = mx, mNext = m; const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; if (!lastModule) { mxNext = mx + 1 == mEndX ? mStartX : mx + 1; myNext = my + (mx + 1 == mEndX); mNext = myNext * numModulesX + mxNext; } for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { const bool lastBatch = caseIdx + preloadCases == numImages; // const float* im = &images[caseIdx + preloadCases + pixIdx]; // const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; if (lastBatch) { // ha = &hidActs[mNext * numImages]; hidActsOffset2 = hidActsOffset + mNext * numImages; } #pragma unroll for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { shHidActs[loadY+y][loadX] = haPreload[y*preloadCases/(B_X*B_Y)]; } /* ================================================================================== * Iteration 0 * ================================================================================== */ #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; } } #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter if (pxIdx + blockPixelOffset < filterPixels) { const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; if (pixIdx >= 0) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); } } } } __syncthreads(); haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); WA_C3_LOOP(0,0); haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); WA_C3_LOOP(0,1); haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); haPreload[5] = 
tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); WA_C3_LOOP(0,2); haPreload[6] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[6]); haPreload[7] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[7]); __syncthreads(); } } } if (scale) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; } } } } } else { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { // if (threadIdx.x == 3) targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; } } } } } } /* * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters * threadIdx.x determines filter * threadIdx.y determines pixel in filter * * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum * blockIdx.y determines pixel batch of B_Y * pixelsPerThread * * Number of filters must be divisible by B_X * filtersPerThread * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. * * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) * * B_Y * B_X should be divisible by preloadCases. * preloadCases one of 16, 32. * B_X one of 4, 8, 16, 32 * B_Y arbitrary (satisfying divisibility constraints) * numModules must be divisible by partialSum * pixelsPerThread must be divisible by pixelCache * * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... * so the compiler is messing up here somehow. It's unable to optimize that case away. 
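* As with the other preload variants, the suffix appears to encode the intended instantiation: pc_2 = pixelCache 2, pt_4 = pixelsPerThread 4, f_3 = filtersPerThread 3, r_32 = preloadCases 32, c_3 = numColors 3. Assuming the 16-row block implied by __launch_bounds__(256,2), haPreload then holds 3 * 32 / 16 = 6 values (the [6] annotation), and the two WA_C3_LOOP2 calls at pixel offsets 0 and 2 cover the pixelsPerThread / pixelCache = 2 pixel-cache chunks.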
*/ template __launch_bounds__(256,2) __global__ void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int sumWidth, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X*filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); // const int moduleStride = (imgSize - filterSize + 1) / numModulesX; const int numModules = numModulesY * numModulesX; const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; const int imgOffset = loadX; const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; // images += loadX; // hidActs += blockFilterIdx * numImages * numModules // + loadX; targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.y * numFilters + threadIdx.x; //float* shImgLoad = &shImages[loadY][loadX]; //float* shHidActLoad = &shHidActs[loadY][loadX]; float prod[numColors][pixelsPerThread][filtersPerThread]; #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { prod[c][p][f] = 0; } } } const int mStartX = blockModuleStartX; const int mStartY = blockModuleStartY; const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); const bool doWork = mStartY < mEndY && mStartX < mEndX; // if (mStartY == mEndY || mStartX == mEndX) { // return; // } // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] float haPreload[filtersPerThread * preloadCases / B_Y]; // [6] // if (blockIdx.x != 0 || blockIdx.y !=0) { // return; // } // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, mStartY, mEndX, mEndY); const int fYOff = (blockPixelOffset + tidx) / filterSize; const int fXOff = (blockPixelOffset + tidx) % filterSize; __shared__ int pxIdxes[B_Y*pixelsPerThread]; // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [6] int m = mStartY * numModulesX + mStartX; int fidx[filtersPerThread * preloadCases / B_Y]; // if (doWork) { #pragma unroll for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { fidx[y] = WA_3_FIDX(y) * numImages * numModules; if (doWork) { // Not actually necessary, I think 
haPreload[y] = tex1Dfetch(hidActs, hidActsOffset + fidx[y] + m * numImages); } } // } int mNext = mStartY * numModulesX + mStartX; for (int my = mStartY; my < mEndY; my++) { // const int imgLoadModPosY = paddingStart + my * moduleStride; for (int mx = mStartX; mx < mEndX; mx++) { m = mNext;//my * numModulesX + mx; // __syncthreads(); // const int imgLoadModPosX = paddingStart + mx * moduleStride; if (tidx < B_Y * pixelsPerThread) { const int imgLoadModPosY = paddingStart + my * moduleStride; const int imgLoadModPosX = paddingStart + mx * moduleStride; const int pxY = (imgLoadModPosY + fYOff); const int pxX = (imgLoadModPosX + fXOff); const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; } __syncthreads(); const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; mNext = lastModule * m + !lastModule * ((my + (mx + 1 == mEndX)) * numModulesX + (mx + 1 == mEndX ? mStartX : mx + 1)); // if (!lastModule) { // const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1; // const int myNext = my + (mx + 1 == mEndX); // mNext = myNext * numModulesX + mxNext; // } for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { const bool lastBatch = caseIdx + preloadCases == numImages; // const float* im = &images[caseIdx + preloadCases + pixIdx]; // const float* ha = hidActs + !lastBatch * (caseIdx + preloadCases + m * numImages) + lastBatch * mNext * numImages; const int hidActsOffset2 = hidActsOffset + !lastBatch * (caseIdx + preloadCases + m * numImages) + lastBatch * mNext * numImages; // if (lastBatch) { // ha = &hidActs[mNext * numImages]; // } #pragma unroll for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { shHidActs[loadY+y][loadX] = haPreload[y*preloadCases/(B_X*B_Y)]; } /* ================================================================================== * Iteration 0 * ================================================================================== */ #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; } } } #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); } } } } __syncthreads(); haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); WA_C3_LOOP2(0); __syncthreads(); /* 
================================================================================== * Iteration 1 * ================================================================================== */ #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { // const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; } } } #pragma unroll for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { // Make sure number of rows in the array is divisible by number of rows filled per iteration if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { #pragma unroll for (int c = 0; c < numColors; c++) { shImages[loadY+y + c * pixelCache * B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); } } } } __syncthreads(); WA_C3_LOOP2(2); __syncthreads(); } } } if (scale) { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; } } } } } else { #pragma unroll for (int p = 0; p < pixelsPerThread; p++) { if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { #pragma unroll for (int c = 0; c < numColors; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; } } } } } } /*****************************Function Revision Record***************************** * Author: Tencent BestImage Team(ankerguo@tencent.com) * * Date: 2015-05-18 * * Reason: Optimizing kernel to get faster speed according to GPU features * * Method: * * 1. reorganizing data structure to avoid bank conflict; * * 2. using vectorized data type; * * 3. improving instruction-level parallelism; * * 4. removing redundant 'if' branches; * * 5. removing local variables to save registers. 
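* (A likely reading of items 1 and 2: the float2 shared arrays are laid out [preloadCases][colorsPerThread * B_Y / 2 + 2], so pairing values halves the row width while the +2 padding staggers consecutive rows across banks, and each shared-memory access moves 64 bits instead of 32.)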
* *********************************************************************************/ /* * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) */ template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale> __launch_bounds__(128, 4) __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int numGroups, const int sumWidth, const float scaleTargets, const float scaleOutputs) { // avoid bank conflict by reorganizing the data structure, and improve the bandwidth by using 'float2' instead of 'float' __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; const int tidx = B_X * ty + tx; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X * filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; // const int moduleIdx = partialSum * outputModuleIdx; const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); const int numModules = numModulesY * numModulesX; const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numFilterColors = numImgColors / numGroups; const int blockPixelOffset = blockIdx.z; // pixel idx in filter const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; const int hidActsOffset = blockFilterIdx * numImages * numModules + loadY * numImages * numModules + loadX; // // hidActs += // blockFilterIdx * numImages * numModules // + loadY * numImages * numModules // + loadX; targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + (blockFilterColorIdx + ty) * filterPixels * numFilters + blockPixelOffset * numFilters + blockFilterIdx + tx; // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart,
moduleStride))); const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); // if (mStartY == mEndY || mStartX == mEndX) { // return; // } const bool doWork = mStartY < mEndY && mStartX < mEndX; // reduce 2 registers //float* shHidActLoad = &shHidActs[loadY][loadX]; //float* shImgLoad = &shImages[loadY][loadX]; float imPreload[preloadCases*colorsPerThread/B_X]; // [8] float haPreload[preloadCases*filtersPerThread/B_Y]; // [8] float prod[filtersPerThread][colorsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[f][c] = 0; } } int pixIdx, pixIdxNext, m, mNext; conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( mStartY, mStartX, paddingStart, numModulesX, moduleStride, blockPixelY, blockPixelX, imgSizeX, imgStride, pixIdx, m); if (doWork) { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels. // All other ways cause crazy excessive register usage. const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx); imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + idx); } } if (doWork) { #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { // Almost certainly not necessary here. const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages); haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + idx); } } for (int my = mStartY; my < mEndY; my++) { for (int mx = mStartX; mx < mEndX; mx++) { int myNext = my, mxNext = mx; const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; if (!lastModule) { mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; myNext = my + (mx + 1 == mEndX); } conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( myNext, mxNext, paddingStart, numModulesX, moduleStride, blockPixelY, blockPixelX, imgSizeX, imgStride, pixIdxNext, mNext); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { // store the preloaded image's pixel into shared memory #pragma unroll for (int y = 0; y < 4; y++) { shImages[loadX][loadY+y*8].x = imPreload[y]; shImages[loadX][loadY+y*8].y = imPreload[y+4]; } //const float* im = &images[caseIdx + preloadCases + pixIdx]; //const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; if (caseIdx + preloadCases == numImages) { pixIdx = pixIdxNext; m = mNext; imgOffset2 = imgOffset + pixIdxNext; hidActsOffset2 = hidActsOffset + mNext * numImages; } // store the images and hidActs shHidActs[loadX][loadY].x = haPreload[0]; shHidActs[loadX][loadY].y = haPreload[2]; shHidActs[loadX][loadY+16].x = haPreload[4]; shHidActs[loadX][loadY+16].y = haPreload[6]; shHidActs[loadX][loadY+8].x = haPreload[1]; shHidActs[loadX][loadY+8].y = haPreload[3]; shHidActs[loadX][loadY+24].x = haPreload[5]; shHidActs[loadX][loadY+24].y = haPreload[7]; // preload the next image and hidAct values #pragma unroll for (int r = 0; r < 8; r++) { imPreload[r] = tex1Dfetch<float>(images, imgOffset2 + (r) * 8 * imgPixels * imgStride); haPreload[r] = tex1Dfetch<float>(hidActs, hidActsOffset2 + (r) * 8 * numImages * numModules); } __syncthreads(); // put together the instructions of same type to improve instruction-level parallelism #pragma unroll for (int r = 0; r < 16; r++) { for (int c = 0; c < 4; c++) { prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x; prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y; prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x; prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y; prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x; prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y; prod[2][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x; prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y; } } __syncthreads(); } } } if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; } } } else { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; } } } } /* * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) * * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) */ template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale> __launch_bounds__(256, 2) __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, const int numImages, const int numFilters, const int numModulesY, const int numModulesX, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int
numGroups, const int sumWidth, const float scaleTargets, const float scaleOutputs) { __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts const int tidx = B_X * threadIdx.y + threadIdx.x; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X * filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; // const int moduleIdx = partialSum * outputModuleIdx; const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); const int numModules = numModulesY * numModulesX; const int numFiltersPerGroup = numFilters / numGroups; const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; const int numFilterColors = numImgColors / numGroups; const int blockPixelOffset = blockIdx.z; // pixel idx in filter const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; const int hidActsOffset = blockFilterIdx * numImages * numModules + loadY * numImages * numModules + loadX; // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; // // hidActs += // blockFilterIdx * numImages * numModules // + loadY * numImages * numModules // + loadX; targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x; // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); // if (mStartY == mEndY || mStartX == mEndX) { // return; // } const bool doWork = mStartY < mEndY && mStartX < mEndX; float* shHidActLoad = &shHidActs[loadY][loadX]; float* shImgLoad = &shImages[loadY][loadX]; float imPreload[preloadCases*colorsPerThread/B_X]; // [6] float haPreload[preloadCases*filtersPerThread/B_Y]; // [16] float prod[filtersPerThread][colorsPerThread]; #pragma unroll for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { prod[f][c] = 0; } } int pixIdx, pixIdxNext, m, mNext; conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( mStartY, mStartX, paddingStart, numModulesX, moduleStride, blockPixelY, blockPixelX, imgSizeX, imgStride, pixIdx, m); if (doWork) { #pragma unroll for (int y 
= 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + y * imgPixels * imgStride + pixIdx); } #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + y * numImages * numModules + m * numImages); } } // if (mStartY > mEndY || mStartX > mEndX) { // printf("crzy!!\n"); // } for (int my = mStartY; my < mEndY; my++) { for (int mx = mStartX; mx < mEndX; mx++) { int myNext = my, mxNext = mx; const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; if (!lastModule) { mxNext = mx + 1 == mEndX ? mStartX : mx + 1; myNext = my + (mx + 1 == mEndX); } conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( myNext, mxNext, paddingStart, numModulesX, moduleStride, blockPixelY, blockPixelX, imgSizeX, imgStride, pixIdxNext, mNext); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { #pragma unroll for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)]; } #pragma unroll for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)]; } __syncthreads(); // const float* im = &images[caseIdx + preloadCases + pixIdx]; // const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; if (caseIdx + preloadCases == numImages) { pixIdx = pixIdxNext; m = mNext; imgOffset2 = imgOffset + pixIdxNext; hidActsOffset2 = hidActsOffset + mNext * numImages; } WA_LOOP(0); WA_LOOP(1); WA_LOOP(2); WA_LOOP(3); WA_LOOP(4); WA_LOOP(5); WA_IMLOAD_TX(0); WA_LOOP(6); WA_IMLOAD_TX(1); WA_LOOP(7); WA_IMLOAD_TX(2); WA_LOOP(8); WA_IMLOAD_TX(3); WA_LOOP(9); WA_IMLOAD_TX(4); WA_LOOP(10); WA_IMLOAD_TX(5); WA_LOOP(11); WA_HALOAD_TX(0); WA_LOOP(12); WA_HALOAD_TX(1); WA_LOOP(13); WA_HALOAD_TX(2); WA_LOOP(14); WA_HALOAD_TX(3); WA_LOOP(15); WA_HALOAD_TX(4); WA_LOOP(16); WA_HALOAD_TX(5); WA_LOOP(17); WA_HALOAD_TX(6); WA_LOOP(18); WA_HALOAD_TX(7); WA_LOOP(19); WA_HALOAD_TX(8); WA_LOOP(20); WA_HALOAD_TX(9); WA_LOOP(21); WA_HALOAD_TX(10); WA_LOOP(22); WA_HALOAD_TX(11); WA_LOOP(23); WA_HALOAD_TX(12); WA_LOOP(24); WA_HALOAD_TX(13); WA_LOOP(25); WA_HALOAD_TX(14); WA_LOOP(26); WA_HALOAD_TX(15); WA_LOOP(27); WA_LOOP(28); WA_LOOP(29); WA_LOOP(30); WA_LOOP(31); __syncthreads(); } } } if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; } } } else { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; } } } } /*****************************Function Revision Record***************************** * Author: Tencent BestImage Team(ankerguo@tencent.com) * * Date: 2015-05-18 * * Reason: Optimizing kernel to get faster speed according to GPU features * * Method: * * 1. reorganizing data structure to avoid bank conflict; * * 2. using vectorized data type; * * 3. improving instruction-level parallelism; * * 4. 
/*****************************Function Revision Record*****************************
 * Author: Tencent BestImage Team(ankerguo@tencent.com)                           *
 * Date:   2015-05-18                                                             *
 * Reason: Optimizing kernel to get faster speed according to GPU features        *
 * Method:                                                                        *
 *         1. reorganizing data structure to avoid bank conflict;                 *
 *         2. using vectorized data type;                                         *
 *         3. improving instruction-level parallelism;                            *
 *         4. removing redundant 'if' branches;                                   *
 *         5. removing local variables to save registers.                         *
 *********************************************************************************/
/*
 * images:      (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
 * hidActs:     (numFilters, numModulesY, numModulesX, numImages)
 *
 * targets:     (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
 */
template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale>
__launch_bounds__(256, 2)
__global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets,
        const int numImages, const int numFilters,
        const int numModulesY, const int numModulesX,
        const int imgSizeY, const int imgSizeX, const int filterSize,
        const int paddingStart, const int moduleStride, const int imgStride,
        const int numImgColors, const int numGroups, const int sumWidth,
        const float scaleTargets, const float scaleOutputs) {
    // avoid bank conflict by re-organizing the data structure, and improve bandwidth by using 'float2' instead of 'float'
    __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases
    __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts
    const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
    // const int tidx = B_X * threadIdx.y + threadIdx.x; // reduce two registers
    // const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
    // const int filterPixels = filterSize * filterSize; // reduce one register
    const int filterPixelsAll = numFilters * filterSize * filterSize;
    const int imgPixels = imgSizeY * imgSizeX;
    const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
    const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
    const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
    // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
    const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
    const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
    const int blockModuleStartX = blockModuleChunkX * sumWidth;
    const int blockModuleStartY = blockModuleChunkY * sumWidth;
    // const int moduleIdx = partialSum * outputModuleIdx;
    const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
    const int numModules = numModulesY * numModulesX;
    const int numFiltersPerGroup = numFilters / numGroups;
    const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
    const int numFilterColors = numImgColors / numGroups;
    const int blockPixelOffset = blockIdx.z; // pixel idx in filter
    const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize;
    const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
    const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
    const int imgOffset = (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride + (ty * B_X + tx) % preloadCases;
    // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
    const int hidActsOffset = blockFilterIdx * numImages * numModules
            + ((ty * B_X + tx) / preloadCases) * numImages * numModules
            + ((ty * B_X + tx) % preloadCases);
    // // hidActs +=
    //     blockFilterIdx * numImages * numModules
    //     + loadY * numImages * numModules
    //     + loadX;
    // use one temporary register instead of multiple registers
    const int pIdxBase = imgStride * ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX);

    targets += blockModuleChunkIdx * numFilters * filterSize * filterSize * numFilterColors
            + (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters
            + blockPixelOffset * numFilters
            + blockFilterIdx + tx;
    // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;

    const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
    const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
    const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
    const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));

    // reduce 3 registers
    const bool doWork = mStartY < mEndY && mStartX < mEndX;
    // float* shHidActLoad = &shHidActs[loadY][loadX];
    // float* shImgLoad = &shImages[loadY][loadX];

    float imPreload[preloadCases*colorsPerThread/B_X]; // [4]
    float haPreload[preloadCases*filtersPerThread/B_Y]; // [8]

    float prod[filtersPerThread][colorsPerThread];

    #pragma unroll
    for (int f = 0; f < filtersPerThread; f++) {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            prod[f][c] = 0;
        }
    }

    // int pixIdx, pixIdxNext, m, mNext;
    // conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
    //         mStartY, mStartX, paddingStart, numModulesX, moduleStride,
    //         blockPixelY, blockPixelX, imgSizeX, imgStride,
    //         pixIdx, m);
    const int pixIdx = pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride;
    const int m = (mStartY * numModulesX + mStartX);

    // preload the image's pixel
    if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            imPreload[i] = tex1Dfetch<float>(images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx);
        }
    }
    // preload the hidAct's pixel
    if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) {
        #pragma unroll
        for (int i = 0; i < 8; i++) {
            haPreload[i] = tex1Dfetch<float>(hidActs, hidActsOffset + 16 * i * numImages * numModules + m * numImages);
        }
    }

    for (int my = mStartY; my < mEndY; my++) {
        for (int mx = mStartX; mx < mEndX; mx++) {
            for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
                int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase + (my * imgSizeX + mx) * moduleStride * imgStride;
                int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + (my * numModulesX + mx) * numImages;
                if (caseIdx + preloadCases == numImages) {
                    const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
                    const int myNext = my + (mx + 1 == mEndX);
                    imgOffset2 = imgOffset + pIdxBase + (myNext * imgSizeX + mxNext) * moduleStride * imgStride;
                    hidActsOffset2 = hidActsOffset + (myNext * numModulesX + mxNext) * numImages;
                }
                if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
                    // store the previously preloaded pixel into shared memory
                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x = imPreload[0];
                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y = imPreload[2];
                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = imPreload[1];
                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = imPreload[3];
                }
                if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) {
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x = haPreload[0];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y = haPreload[2];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].x = haPreload[4];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].y = haPreload[6];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = haPreload[1];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = haPreload[3];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].x = haPreload[5];
                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].y = haPreload[7];
                }
                #pragma unroll
                for (int r = 0; r < 8; r++) {
                    haPreload[r] = tex1Dfetch<float>(hidActs, hidActsOffset2 + r * 16 * numImages * numModules);
                }
                #pragma unroll
                for (int r = 0; r < 4; r++) {
                    imPreload[r] = tex1Dfetch<float>(images, imgOffset2 + r * 16 * imgPixels * imgStride);
                }
                __syncthreads();
                // put together the instructions of same type to improve instruction-level parallelism
                // calculate the derivative of the hidAct with respect to weight
                #pragma unroll
                for (int r = 0; r < 16; r++) {
                    #pragma unroll
                    for (int c = 0; c < 4; c++) {
                        prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x;
                        prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y;
                        prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x;
                        prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y;
                        prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x;
                        prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y;
                        prod[2][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x;
                        prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y;
                    }
                }
                __syncthreads();
            }
        }
    }

    if (scale) {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int f = 0; f < filtersPerThread; f++) {
                targets[c * B_Y * filterPixelsAll + f * B_X] = scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] + scaleOutputs * prod[f][c];
            }
        }
    } else {
        #pragma unroll
        for (int c = 0; c < colorsPerThread; c++) {
            #pragma unroll
            for (int f = 0; f < filtersPerThread; f++) {
                targets[c * B_Y * filterPixelsAll + f * B_X] = scaleOutputs * prod[f][c];
            }
        }
    }
}

std::pair<int, int> getWeightActsOutputSize(int numModulesY, int numModulesX, int numFilterColors,
        int filterSize, int numFilters, int sumWidth) {
    const int outputModuleChunksX = DIVUP(numModulesX, sumWidth);
    const int outputModuleChunksY = DIVUP(numModulesY, sumWidth);
    const int outputModuleChunks = outputModuleChunksX * outputModuleChunksY;
    return std::pair<int, int>(outputModuleChunks * numFilterColors * filterSize * filterSize, numFilters);
}
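/*
 * [Editorial note] Worked example of the sizing above, with illustrative
 * numbers: for numModulesY = numModulesX = 12, sumWidth = 6,
 * numFilterColors = 16, filterSize = 3, numFilters = 64, we get
 * outputModuleChunksX = outputModuleChunksY = DIVUP(12, 6) = 2, so
 * outputModuleChunks = 4, and the returned shape is
 * (4 * 16 * 3 * 3, 64) = (576, 64): one (filterColor, filterPixel) x filter
 * slab of partial weight gradients per module chunk, which downstream code
 * is expected to reduce over chunks to obtain the final gradient.
 */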
/*
 * images:      (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
 * hidActs:     (numFilters, numModules, numImages)
 *
 * targets:     (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
 *
 * TODO: you can get a slight speed boost for local non-convolutional units by writing special
 * routines for partialSum = 1. But I dunno if the code duplication is worth it...
 *
 * Note: all of these convolution routines are optimized for the case when
 * the number of images (i.e. the minibatch size) is a multiple of 128.
 * Other batch sizes will work, but I made no attempt whatsoever
 * to make them work fast.
 */
void _weightActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
        int imgSizeY, int numModulesY, int numModulesX, int filterSize,
        int paddingStart, int moduleStride, int numImgColors, int numGroups, int sumWidth,
        float scaleTargets, float scaleOutput) {
    CAFFE_ENFORCE(images->ndim() == 2);
    CAFFE_ENFORCE(hidActs->ndim() == 2);
    CAFFE_ENFORCE(targets->ndim() == 2);

    int numFilterColors = numImgColors / numGroups;
    int imgStride = images->dim32(1);
    int numImages = images->dim32(1);
    int imgPixels = images->dim32(0) / numImgColors;
    int imgSizeX = imgPixels / imgSizeY;
    int numModules = numModulesY * numModulesX;
    int numFilters = hidActs->dim32(0) / numModules;
    int numFiltersPerGroup = numFilters / numGroups;

    CAFFE_ENFORCE(numImgColors % numGroups == 0);
    CAFFE_ENFORCE(numFilters % (16*numGroups) == 0);
    CAFFE_ENFORCE(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 16 == 0)));
    CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 16 == 0);
    CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels);
    CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors);

    int filterPixels = filterSize * filterSize;
    int outputModuleChunksX = DIVUP(numModulesX, sumWidth);
    int outputModuleChunksY = DIVUP(numModulesY, sumWidth);
    int outputModuleChunks = outputModuleChunksX * outputModuleChunksY;
    // partialSum = partialSum == 0 ? numModules : partialSum;

    // CAFFE_ENFORCE(numModules % partialSum == 0);
    CAFFE_ENFORCE(hidActs->dim32(1) == numImages);

    // These routines don't handle the case when only part of the image is visited in the convolution
    CAFFE_ENFORCE(paddingStart <= 0);
    CAFFE_ENFORCE(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
    CAFFE_ENFORCE(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
    CAFFE_ENFORCE(moduleStride <= filterSize);

    CAFFE_ENFORCE(numModules * numFilters == hidActs->dim32(0));

    int preloadCases = 32;

    dim3 blocks, threads;
    int bx, by;
    int pixelsPerThread, filtersPerThread, colorsPerThread;
    // Worth playing with these parameters to find best values for your problem.
    // These values work relatively well, but not optimal for all problems.
    if (numFilterColors > 3) {
        filtersPerThread = numFiltersPerGroup % 64 == 0 ? 4
                         : numFiltersPerGroup % 32 == 0 ? 2
                         : 1;
        colorsPerThread = numFilterColors % 64 == 0 ? 8
                        : numFilterColors % 48 == 0 ? 6
                        : numFilterColors % 32 == 0 ? 8
                        : 4;
        by = (numFilterColors / colorsPerThread) % 8 == 0 ? 8 : 4;
        bx = numFiltersPerGroup % 128 == 0 ? 32 : 16;
        preloadCases = filtersPerThread * colorsPerThread < 32 ? 32 : 16;
        blocks = dim3(outputModuleChunks*(numFilters/(bx*filtersPerThread)), numFilterColors / (by*colorsPerThread), filterPixels);
        CAFFE_ENFORCE(numFilterColors % (by*colorsPerThread) == 0);
    } else { // This is ugly but it's nice to spell it out clearly
        CAFFE_ENFORCE(numGroups == 1); // Just for sanity
        // NOTE: these things are only optimized for colors = 3. I didn't really test other cases.
        if (numFilters % 64 == 0) { // TODO: having a separate case for 128 would make things faster, but I probably don't care about 128
            filtersPerThread = 4;
            pixelsPerThread = 2;
            by = 16;
            bx = 16;
            preloadCases = 32;
        } else if (numFilters % 48 == 0) {
            filtersPerThread = 3;
            pixelsPerThread = 4;
            by = 16;
            bx = 16;
            preloadCases = 32;
        } else if (numFilters % 32 == 0) {
            filtersPerThread = 2;
            pixelsPerThread = 2;
            by = 8;
            bx = 16;
            preloadCases = 16;
        } else { // This case is completely untested. It might be really slow. But no time now.
            filtersPerThread = 1;
            pixelsPerThread = 16;
            by = 16;
            bx = 16;
            preloadCases = 32;
        }
        blocks = dim3(outputModuleChunks*(numFilters/(bx*filtersPerThread)), DIVUP(filterPixels, by*pixelsPerThread));
    }
    CAFFE_ENFORCE((by * bx) % preloadCases == 0);
    CAFFE_ENFORCE(numFilters % (bx * filtersPerThread) == 0);
    threads = dim3(bx, by);
    bool checkCaseBounds = numImages % preloadCases != 0;
    bool scale = scaleTargets != 0;

    std::pair<int, int> targetSize = getWeightActsOutputSize(numModulesY, numModulesX, numFilterColors, filterSize, numFilters, sumWidth);
    if (!scale) {
        targets->Resize(std::vector<int>{targetSize.first, targetSize.second});
    } else {
        CAFFE_ENFORCE(targets->dim32(0) == targetSize.first);
        CAFFE_ENFORCE(targets->dim32(1) == targetSize.second);
    }

    cudaTextureObject_t tex_images = GetTensorTextureObject(images);
    cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
    float* images_data = images->mutable_data<float>();
    float* hidacts_data = hidActs->mutable_data<float>();
    float* targets_data = targets->mutable_data<float>();
    const std::size_t images_bytes = images->nbytes();

    cudaStream_t stream = context->cuda_stream();

    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
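    // [Editorial note] The cascade below maps the runtime launch configuration
    // chosen above onto compile-time template instantiations: the template
    // arguments < B_Y, B_X, filtersPerThread, colorsPerThread, preloadCases,
    // scale > (plus pixelsPerThread / numColors / checkCaseBounds for the
    // color <= 3 kernels) mirror the by/bx/filtersPerThread/colorsPerThread/
    // preloadCases values computed from the divisibility of numFilterColors
    // and numFiltersPerGroup, so each branch launches a fully unrolled kernel
    // specialized for exactly one tiling. The texture-object variants
    // (tex_images/tex_hidacts) are used only by the preloading kernels; the
    // *_sw variants take the raw device pointers.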
paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 48 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, false ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false 
><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 16 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 3, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 3, false, false ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, false, false ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { 
cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 2) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 1) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 
32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } } } else if (checkCaseBounds == true) { if (numFilterColors > 3) { if (numFilterColors % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 48 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, 
moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 16 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, 
numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 2) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, true 
><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 1) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, true >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } } } } else if (scale == true) { if (checkCaseBounds == false) { if (numFilterColors > 3) { if (numFilterColors % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< 8, 32, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< 8, 32, 4, 8, 16, true ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { 
cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< 8, 16, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< 8, 16, 4, 8, 16, true ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 48 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, true ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, 
filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 16 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } } else if (numFilterColors <= 3) { if (numFilterColors == 3) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 3, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 
3, true, false ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, true, false ><<>>(tex_images, tex_hidacts, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 2) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors == 1) { if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, true, false >, cudaFuncCachePreferShared); 
conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 48 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, true, false >, cudaFuncCachePreferShared); conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, true, false ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); } } } } else if (checkCaseBounds == true) { if (numFilterColors > 3) { if (numFilterColors % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 64 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 32 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } else if (numFiltersPerGroup % 16 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, true >, cudaFuncCachePreferShared); conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, true ><<>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } } else if (numFilterColors % 48 == 0) { if (numFiltersPerGroup % 128 == 0) { 
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    }
                } else if (numFilterColors % 32 == 0) {
                    if (numFiltersPerGroup % 128 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    }
                } else if (numFilterColors % 16 == 0) {
                    if (numFiltersPerGroup % 128 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput);
                    }
                }
            } else if (numFilterColors <= 3) {
                if (numFilterColors == 3) {
                    if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 48 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    }
                } else if (numFilterColors == 2) {
                    if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 48 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    }
                } else if (numFilterColors == 1) {
                    if (numFiltersPerGroup % 64 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 48 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 32 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    } else if (numFiltersPerGroup % 16 == 0) {
                        cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, true, true >, cudaFuncCachePreferShared);
                        conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, true, true ><<<blocks, threads, 0, stream>>>(images_data, hidacts_data, targets_data, numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput);
                    }
                }
            }
        }
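    // Dispatch recap: every branch above pairs a cudaFuncSetCacheConfig()
    // call (requesting a shared-memory-preferring cache configuration) with a
    // launch of the same template instantiation on this context's stream. The
    // compile-time tile parameters are picked from the divisibility of
    // numFiltersPerGroup (128/64/32/16, plus 48 in the few-color cases) and of
    // numFilterColors (multiples of 64/32/16, or literal 1-3 colors, which use
    // conv_weight_acts_c_kepler_sw rather than the _mc_mf_ variant). For
    // example, numFilterColors == 32 with numFiltersPerGroup == 64 selects the
    // conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, true > launch above.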
    }
    checkCudaErrors(cudaDestroyTextureObject(tex_images));
    checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
    getLastCudaError("weightActs: kernel execution failed");
}

void convWeightActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
                    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                    int numImgColors, int numGroups, int partialSum) {
    _weightActs(context, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                moduleStride, numImgColors, numGroups, partialSum, 0, 1);
}

void convWeightActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
                    int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                    int numImgColors, int numGroups, int partialSum, float scaleTargets, float scaleOutput) {
    _weightActs(context, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                moduleStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
}

void localWeightActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
                     int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                     int numImgColors, int numGroups) {
    _weightActs(context, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                moduleStride, numImgColors, numGroups, 1, 0, 1);
}

void localWeightActs(caffe2::CUDAContext* context, caffe2::TensorCUDA* images, caffe2::TensorCUDA* hidActs, caffe2::TensorCUDA* targets,
                     int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                     int numImgColors, int numGroups, float scaleTargets, float scaleOutput) {
    _weightActs(context, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                moduleStride, numImgColors, numGroups, 1, scaleTargets, scaleOutput);
}
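// A minimal call sketch (hypothetical setup, for illustration only; the
// tensor names and dimension variables below are assumptions, not code from
// this file). With images/hidActs/targets laid out the way cudaconv3 expects
// (cases in columns), a weight-gradient pass that overwrites targets would
// look like:
//
//     convWeightActs(&context, &images, &hidActs, &targets,
//                    imgSizeY, numModulesY, numModulesX, filterSize,
//                    paddingStart, moduleStride, numImgColors, numGroups,
//                    partialSum);
//
// which, per the wrapper above, forwards to _weightActs with
// scaleTargets = 0 and scaleOutput = 1, i.e. no accumulation into targets.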
================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile
================================================
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users. This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################

# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64

# Common binaries
NVCC = $(CUDA_BIN_PATH)/nvcc
GCC = g++
AR = ar

# CUDA code generation flags
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM35)

LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
NVCCFLAGS := -m64

# Debug build flags
ifeq ($(dbg),1)
	CCFLAGS += -g
	NVCCFLAGS += -g -G
	DBG := debug
else
	DBG := release
	NVCCFLAGS += -O3
	CCFLAGS += -O3
endif

# Add profiler output
ifeq ($(prof),1)
	NVCCFLAGS += --ptxas-options=-v
endif

TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)

########## USER STUFF ###########
PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
MODELNAME := _ConvNet
LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)

DEFINES := -DNUMPY_INTERFACE

CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS := $(shell find . -name "*.h")

NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC

TARGET := $(TARGETDIR)/$(MODELNAME).so

################################################################################
# Set up target and object files
################################################################################
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))

# Target rules
all: makedirs $(TARGET)

$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
	$(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
	$(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $<

$(TARGET): $(OBJS)
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
	ln -sf $(TARGET) .

makedirs:
	mkdir -p $(TARGETDIR)
	mkdir -p $(OBJDIR)/src

clean:
	rm -rf ./obj

================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/__init__.py
================================================

================================================
FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/actbroadcaster.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef ACTBROADCASTER_CUH_H_ #define ACTBROADCASTER_CUH_H_ #include #include "streambroadcast.cuh" #include "copypipeline.cuh" class BroadcastMessage { public: enum MESSAGE_TYPE { BROADCAST, EXIT }; protected: int _srcDevice; std::map _mats; int _userIdx; Queue* _finishQueue; MESSAGE_TYPE _type; BroadcastMessage(MESSAGE_TYPE type); public: BroadcastMessage(std::map mats, int srcDevice, int userIdx, Queue& finishQueue); int getSrcDevice(); std::map& getMatrices(); int getUserIdx(); Queue& getFinishQueue(); MESSAGE_TYPE getMessageType(); }; class ExitBroadcastMessage : public BroadcastMessage { public: ExitBroadcastMessage(); }; class ActBroadcaster : public Thread { protected: std::map _broadcasters; // src device --> broadcaster Queue _messageQueue; int _numUsers; public: ActBroadcaster(int numUsers, intv& cpus); ~ActBroadcaster(); Queue& getMessageQueue(); virtual void* run(); void stop(); }; #endif /* ACTBROADCASTER_CUH_H_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef CONVNET3 #define CONVNET3 #include #include #include #include #include #include #include "../../util/include/queue.h" #include "../../util/include/thread.h" #include #include "../../util/include/sync.h" #include "messages.cuh" #include "streambroadcast.cuh" #include "layer.cuh" #include "data.cuh" #include "worker.cuh" #include "weights.cuh" #include "pipedispenser.cuh" #include "timer.cuh" class Worker; class WorkResult; class Layer; class DataLayer; class CostLayer; class ConvNetThread; class StreamBroadcast; class Weights; // name -> device id -> layer* typedef std::map > NameReplicaLayerMap; typedef std::map NameLayerMap; // name -> ReplicaMap //typedef std::map ReplicaNameLayerMap; typedef std::vector ConvNetThreadV; typedef std::vector DataLayerVector; //typedef std::map ReplicaThreadsMap; class ConvNet : public Thread { private: void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights); protected: NameReplicaLayerMap _layerMap; DataLayerVector _dataLayers; // Vector of convnet threads (one thread == one GPU) ConvNetThreadV _convNetThreads; DataProvider* _dp; CPUData* _data, *_bufferData; int _bufferMinibatchIdx, _bufferPassIdx; ThreadSynchronizer* _sync; intv _deviceIDs; Queue _workerQueue; Queue _resultQueue; Queue _msgQueue; int _numFwdTerminal; std::map _numBwdTerminal; // pass idx -> #terminal int _totalPassesDone; int _numReplicasMin, _numReplicasMax; // For gradient checking int _numFailures; int _numTests; // Training progress (between 0 and 1). // Used to determine learning rate based on ParameterSchedule. 
double _trainingProgress; double _baseErr; bool _conserveMem; PipeDispenser *_dataCopyPD; void waitForTerminals(int numMsgs, MESSAGES msg); void sendMessage(MESSAGES msg, bool sync); void sendMessage(Message* msg, bool sync); void findBwdTerminal(Layer& l, std::set& visited, int& terminal, int passIdx); void connectReplicas(); void initDataLayers(PyObjectV* layerList); void initGPUThreads(PyObjectV* layerList); void connectChildren(PyObject* layerParams); void* run(); void setData(CPUData& data, int passIdx); void setDataFromBuffer(); void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx); public: ConvNet(PyObject* layerParams, intv& deviceIDs, int minibatchSize, bool conserveMem); ~ConvNet(); void stop(); Queue& getMessageQueue(); Queue& getWorkerQueue(); Queue& getResultQueue(); DataProvider& getDataProvider(); Layer& getLayer(std::string& name, int replicaID); void copyToCPU(); void copyToGPU(); void updateWeights(int passIdx); void reset(int passIdx); void reset(); void bprop(int passIdx, PASS_TYPE passType); void fprop(int miniIdx, int passIdx, PASS_TYPE passType); void fprop(CPUData& data, int passIdx, PASS_TYPE passType); void setTrainingProgress(double progress); double getTrainingProgress() const; bool checkGradient(const std::string& name, float eps, Weights& weights); void checkGradients(); Cost& getCost(); Cost& getCost(Cost& cost); CPUData& getData(); // Returns last minibatch fpropped double getCostValue(); intv& getDeviceIDs(); ThreadSynchronizer& getSync(); void syncWithChildren(); int getMinibatchSize(); bool isConserveMemory(); int getNumReplicasMax(); int getNumReplicasMin(); int getNumPasses(); int getTotalPassesDone(); PipeDispenser& getDataCopyPD(); }; class ConvNetThread : public Thread { protected: NameLayerMap _nameLayerMap; std::vector _costs; ConvNet* _convNet; int _deviceID; Queue _msgQueue; Timer _timer; // StreamBroadcast* _weightSynchronizer; void initCuda(); virtual void initLayer(PyObject* paramsDict, int replicaID); void* run(); public: ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet); ~ConvNetThread(); NameLayerMap& getLayerMap(); int getDeviceID(); ConvNet& getConvNet(); Queue& getMessageQueue(); std::vector& getCostLayers(); // StreamBroadcast& getWeightSynchronizer(); Cost& getCost(); Layer& getLayer(std::string& name); void startTimer(); double stopTimer(); }; #endif /* CONVNET */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/copypipeline.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef COPYPIPELINE_CUH_ #define COPYPIPELINE_CUH_ #include #include "../../util/include/thread.h" #include "../../util/include/queue.h" #include #include "../../nvmatrix/include/nvmatrix.cuh" #include "util.cuh" #define COPY_MIN_CHUNK_SIZE (1<<18) // 256k #define COPY_MAX_CHUNKS 16 #define COPY_MIN_CHUNKS 2 class CopyPeer; class CopySource; class ICopySegment; class IBroadcastNetwork; class CopyMessage { protected: std::map* _mats; float _scaleSource, _scaleTargets; public: enum COPY_MESSAGE_TYPE { COPY_CHUNK, COPY_START, EXIT }; CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map& mats) : _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) { } CopyMessage(COPY_MESSAGE_TYPE msgType) : _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) { } inline COPY_MESSAGE_TYPE getType() const { return _msgType; } inline NVMatrix& getMatrix(int deviceID) const { return *_mats->at(deviceID); } inline std::map& getMatrices() const { return *_mats; } inline float getScaleSource() const { return _scaleSource; } inline float getScaleTargets() const { return _scaleTargets; } protected: COPY_MESSAGE_TYPE _msgType; }; class CopyChunkMessage : public CopyMessage { protected: int _chunkIdx; int _chunkSize; int _numChunks; public: CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map& mats) : _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) { } inline int getChunkIdx() const { return _chunkIdx; } inline int getChunkSize() const { return _chunkSize; } inline int getNumChunks() const { return _numChunks; } }; class CopyStartMessage : public CopyMessage { public: CopyStartMessage(float scaleSource, float scaleTargets, std::map& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) { } }; class ICopySegment : public Thread { protected: int _deviceID, _execDeviceID; cudaStream_t _stream; ICopySegment* _prev; std::vector _next; Queue _queue; Queue* _finishQueue; HostNVMatrix _hmat; IBroadcastNetwork* _parent; NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx); void* run(); virtual bool processMessage(CopyMessage& msg) = 0; public: ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); virtual ~ICopySegment(); inline NVMatrix& getMatrix(CopyMessage& msg); Queue& getQueue(); inline int getDeviceID(); void addPrev(ICopySegment& c); void addNext(CopyPeer& c); bool isTerminal() const; virtual bool isSource() const = 0; }; class CopySource : public ICopySegment { protected: bool processMessage(CopyMessage& msg); public: CopySource(IBroadcastNetwork& parent, int deviceID); inline bool isSource() const; }; class CopyPeer : public ICopySegment { protected: bool processMessage(CopyMessage& msg); public: CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue); inline bool isSource() const; }; class IBroadcastNetwork { protected: Queue _finishQueue; CopySource* _src; std::vector _peers; int _srcDeviceID, _numTerminal; bool _constructed; std::set _devices; std::pair,std::vector > makeGPULists(); void makePeers(std::pair,std::vector >& gpus); virtual void makeConnections() = 0; virtual void _broadcast(std::map& mats, float scaleSource, float scaleTargets); IBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); public: virtual IBroadcastNetwork& construct(); virtual ~IBroadcastNetwork(); virtual void broadcast(std::map& mats); int 
getSourceDeviceID() const; static IBroadcastNetwork& make(std::set devices, int srcDeviceID); }; class ISafeBroadcastNetwork : public IBroadcastNetwork { protected: ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal); public: virtual void broadcast(std::map& mats, float scaleSource, float scaleTargets); virtual ISafeBroadcastNetwork& construct(); static ISafeBroadcastNetwork& make(std::set devices, int srcDeviceID); }; class NullBroadcaster : public ISafeBroadcastNetwork { protected: NullBroadcaster(std::set& devices, int srcDeviceID); void makeConnections(); public: NullBroadcaster& construct(); void broadcast(std::map& mats, float scaleSource, float scaleTargets); void broadcast(std::map& mats); friend class IBroadcastNetwork; friend class ISafeBroadcastNetwork; }; /* * This one goes to host and then to targets. */ class NaiveBroadcaster : public ISafeBroadcastNetwork { protected: NaiveBroadcaster(std::set& devices, int srcDeviceID); void makeConnections(); friend class IBroadcastNetwork; friend class ISafeBroadcastNetwork; }; class EightGPUBroadcaster1 : public IBroadcastNetwork { protected: EightGPUBroadcaster1(std::set& devices, int srcDeviceID); void makeConnections(); friend class IBroadcastNetwork; }; class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork { protected: int _tgtDeviceID; cudaStream_t _tgtStream; void makeConnections(); void resetDeviceID(int d); void _broadcast(std::map& mats, float scaleSource, float scaleTargets); public: TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID); ~TwoPeeringGPUsBroadcaster(); ISafeBroadcastNetwork& construct(); friend class IBroadcastNetwork; friend class ISafeBroadcastNetwork; }; #endif /* COPYPIPELINE_CUH_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef COST_CUH #define COST_CUH #include #include #include #include "layer.cuh" #include "util.cuh" class CostLayer; /* * Wrapper for dictionary mapping cost name to vector of returned values. */ class Cost { protected: std::map _numCases; CostMap _costMap; CostCoeffMap _costCoeffMap; std::map& getNumCasesMap(); public: Cost(); Cost(std::vector& costs); doublev& operator [](const std::string s); CostMap& getCostMap(); CostCoeffMap& getCostCoeffMap(); int getNumCases(); /* * Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients. */ double getValue(); Cost& operator += (Cost& er); virtual ~Cost(); void print(); }; #endif /* COST_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef DATA_CUH #define DATA_CUH #include #include #include "util.cuh" class CPUData { protected: MatrixV* _data; void assertDimensions() { assert(_data->size() > 0); for (int i = 1; i < _data->size(); i++) { assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols()); if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) { _data->at(i)->setTrans(_data->at(i-1)->isTrans()); } assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans()); } assert(_data->at(0)->getNumCols() > 0); } public: typedef typename MatrixV::iterator T_iter; // Cases in columns, but array may be transposed // (so in memory they can really be in rows -- in which case the array is transposed // during the copy to GPU). CPUData(PyObject* pyData) { _data = getMatrixV(pyData); assertDimensions(); } CPUData(MatrixV* data) : _data(data) { assertDimensions(); } ~CPUData() { for (T_iter it = _data->begin(); it != _data->end(); ++it) { delete *it; } delete _data; } Matrix& operator [](int idx) const { return *_data->at(idx); } int getSize() const { return _data->size(); } MatrixV& getData() const { return *_data; } Matrix& getData(int i) const { return *_data->at(i); } bool isTrans() const { return _data->at(0)->isTrans(); } int getNumCases() const { return _data->at(0)->getNumCols(); } }; class DataProvider { protected: CPUData* _hData; NVMatrixV _data; int _minibatchSize; public: DataProvider(int minibatchSize); void setData(CPUData&); void clearData(); CPUData& getMinibatch(int idx); CPUData& getDataSlice(int startCase, int endCase); int getNumMinibatches(); int getMinibatchSize(); int getNumCases(); }; #endif /* DATA_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/gradreducer.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef GRADREDUCER_CUH_ #define GRADREDUCER_CUH_ #include #include #include "streambroadcast.cuh" #include "reducepipeline.cuh" #include "layer.cuh" #include "util.cuh" class StreamBroadcast; class Layer; #define ACT_GRAD_REDUCER_EXIT (1 << 16) //class ReduceMessage { // ReduceMessage(); // ReduceMessage(bool exit); //}; class IActGradReducer : public Thread { protected: Layer* _parent; Queue _finishQueue; int _numExpectedMsgsTotal; std::map _numExpectedMsgs; // map from device id -> num expected msgs void* run(); virtual bool reduce() = 0; virtual void reset() = 0; public: IActGradReducer(Layer& parent, std::map numExpectedMsgs); virtual ~IActGradReducer(); int waitForFinish(); virtual void enqueueReduction(int deviceID) = 0; virtual void stop() = 0; static IActGradReducer& makeGradReducer(Layer& parent, std::map numExpectedMsgs); }; class SequentialActGradReducer : public IActGradReducer { protected: std::map _numReceivedMsgs; // map from device id -> num received msgs std::map* > _messageQueues; intv _deviceIDs; StreamBroadcast* _broadcaster; bool reduce(); void reset(); public: SequentialActGradReducer(Layer& parent, std::map numExpectedMsgs); ~SequentialActGradReducer(); void enqueueReduction(int deviceID); void stop(); }; class ParallelActGradReducer : public IActGradReducer { protected: IEightGPUReducer* _reducer; int _numReceivedMsgs; float _scaleTarget; Queue _messageQueue; bool reduce(); void reset(); public: ParallelActGradReducer(Layer& parent, std::map numExpectedMsgs); void enqueueReduction(int deviceID); void stop(); }; #endif /* GRADREDUCER_CUH_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef JPEG_MAIN_H #define JPEG_MAIN_H #include #include #include #include #include #include #include //#include #include "../../util/include/thread.h" #include "../../util/include/matrix.h" #ifndef DIVUP #define DIVUP(x, y) (((x) + (y) - 1) / (y)) #endif #define NUM_JPEG_DECODER_THREADS 4 class DecoderThread : public Thread { protected: PyObject* _pyList; Matrix* _target; int64 _start_img, _end_img; int64 _img_size, _inner_size, _inner_pixels; bool _test, _multiview; unsigned char* _decodeTarget; int64 _decodeTargetSize; unsigned int _rseed; void* run(); void decodeJpeg(int idx, int& width, int& height); double randUniform(); double randUniform(double min, double max); void crop(int64 i, int64 width, int64 height, bool flip); virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y); public: DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview); virtual ~DecoderThread(); }; #endif // JPEG_MAIN_H ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LAYER_CUH #define LAYER_CUH #include #include #include #include #include #include #include "../../nvmatrix/include/nvmatrix.cuh" //#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh" #include "weights.cuh" #include "convnet.cuh" #include "cost.cuh" #include "neuron.cuh" #include "data.cuh" #include "layer_kernels.cuh" #include "streambroadcast.cuh" #include "actbroadcaster.cuh" #include "gradreducer.cuh" #include "util.cuh" #include "timer.cuh" #include "memorysource.cuh" class Cost; class ConvNet; class ConvNetThread; class CostLayer; class DataLayer; class Layer; class ActBroadcaster; class BroadcastMessage; class IActGradReducer; class Weights; class WeightList; typedef std::vector LayerV; class BinomialCrossEntOperator { protected: float _posWeight; public: BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) { } __device__ inline float operator()(const float t, const float y) const { return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y); } }; class CrossEntOperator { protected: float _posWeight; public: CrossEntOperator(float posWeight) : _posWeight(posWeight) { } __device__ inline float operator()(const float t, const float y) const { return _posWeight * t * safelog(y); } }; /* * Abstract layer. 
*/ class Layer { protected: ConvNetThread* _convNetThread; // This is a vector[#layers_next] std::vector _next; // This is a vector[#replicas_prev][#layers_prev] std::map > _prev; int _rcvdFInputMsgs; std::map _numComputedActsGrads; int _rcvdBInputMsgs; int _numOutputs; std::map _inputs; // input idx -> matrix std::map _memSrcActs; // device id -> memory source std::map _memSrcActsGrad; // device id -> memory source bool _gradConsumer, _foundGradConsumers, _trans; std::map _bwdTerminal; // One bool per pass int _numGradProducersNext; int _actsTarget, _actsGradTarget; std::string _name, _type; intv _nextDeviceIDs, _prevDeviceIDs; HostNVMatrix _hostMemFwd; // New replica-related stuff: std::map _replicas; // NOTE: a layer is its own sibling, too // Previous layers sorted by device ID, in reverse order in which they are processed by // sequential grad reducer. map from replica -> device id -> layers std::map > > _prevByDevice; std::map _inputIndices; int _replicaID; int _numReplicas; int _numReplicasPrev, _numReplicasNext; Queue _broadcastFinishQueue; Queue _reductionFinishQueue; ActBroadcaster* _actBroadcaster; IActGradReducer* _gradReducer; Timer _timer; bool _initialized; virtual void fpropNext(PASS_TYPE passType, int passIdx); virtual void truncBwdActs(); virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0; virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) { // Do nothing by default } virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { assert(!isGradProducer()); // Only do nothing if not grad producer } virtual void fpropCommon(PASS_TYPE passType) { } void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx); ActBroadcaster& getActBroadcaster(); IActGradReducer& getGradReducer(); int getInputIdx(std::string& parentName); void setInputIdx(std::string& parentName, int idx); public: static bool _saveActsGrad, _saveActs; Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); virtual ~Layer(); virtual bool fprop(PASS_TYPE passType, int passIdx); void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx); virtual void fprop(std::map& v, PASS_TYPE passType, int passIdx); virtual void bprop(PASS_TYPE passType, int passIdx); virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); virtual void reset(); virtual void resetPassIdx(); int getNumCases(NVMatrix& v); int& getNumComputedActsGrads(int deviceID); int incRcvdBInputMsgs(); bool isGradConsumer(); bool hasGradProducerNext(std::string& layerName); // Does this layer produce a gradient for any layer? virtual bool isGradProducer(); // Does this layer produce a gradient for layer of given name?
virtual bool isGradProducer(std::string& layerName); std::string& getName(); std::string& getType(); virtual void addNext(Layer& l); virtual void addPrev(Layer& l, int replicaIdx); virtual void addReplica(Layer& l); std::map >& getPrev(); std::vector& getNext(); virtual NVMatrix& getActs(); virtual NVMatrix& getActs(int deviceID); virtual NVMatrix& getActs(int deviceID, int numCases); virtual NVMatrix& getActsGrad(); virtual NVMatrix& getActsGrad(int deviceID); virtual std::map getAllActs(); virtual std::map getAllActsGrads(); virtual bool postInit(); int getDeviceID(); ConvNetThread& getConvNetThread(); cudaStream_t getStream(); void syncStream(); void setBwdTerminal(int passIdx); // Do nothing if this layer has no weights virtual bool updateWeights() { return false; } virtual bool constrainWeights() { return false; } virtual void checkGradient() { } virtual void copyToCPU() { } virtual void copyToGPU() { } intv& getNextDeviceIDs() { return _nextDeviceIDs; } int getReplicaID(); int getNumReplicas(); int getNumSiblingReplicas(); int getNumReplicasPrev(); int getNumReplicasNext(); int getNumOutputs(); void setMemorySourceActs(int deviceID, MemoryView& mem); void setMemorySourceActsGrad(int deviceID, MemoryView& mem); MemoryView& getMemorySourceActs(int deviceID); MemoryView& getMemorySourceActsGrad(int deviceID); int getFwdActiveInputReplicaIdx(int passIdx); int getBwdActiveInputReplicaIdx(int passIdx); int getFwdActiveReplicaIdx(int passIdx); int getNumLayersPrev(); virtual int getNumInputReplicas(); int getNumExpectedBwdMsgs(); int getNumExpectedFwdMsgs(); int getReplicaIdx(); int getActivePassPeriod(); int getNumGradProducersNext(); virtual ConvNet& getConvNet(); }; class TwoDLayerInterface { protected: int _channels, _imgSize, _imgPixels; public: TwoDLayerInterface(PyObject* paramsDict); }; class NeuronLayer : public Layer { protected: Neuron* _neuron; std::string _neuronType; virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: class CrossEntLogisticGradientOperator { private: float _coeff, _posWeight; public: CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { } __device__ inline float operator()(const float y, const float t) const { return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y); } }; NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); ~NeuronLayer(); std::string& getNeuronType(); }; class WeightLayer : public Layer { protected: WeightList* _weights; Weights *_biases; NVMatrix _norm2; float _wStep, _bStep; int _weightUpdatePassPeriod; void fpropCommon(PASS_TYPE passType); void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType); virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0; virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0; virtual void _constrainWeights(); virtual float getGradScale(int inpIdx, PASS_TYPE passType); virtual float getIncScale(int inpIdx, PASS_TYPE passType); virtual float getBGradScale(PASS_TYPE passType); virtual float getBIncScale(); virtual NVMatrix& getGradTarget(int inpIdx); NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx); NVMatrix& getBiasMatrix(PASS_TYPE passType); public: WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int 
replicaID, bool trans, bool useGrad); virtual ~WeightLayer(); virtual bool updateWeights(); virtual bool constrainWeights(); virtual void copyToCPU(); virtual void copyToGPU(); virtual void checkGradient(); Weights& getWeights(int idx); void addReplica(Layer& l); virtual bool postInit(); }; class FCLayer : public WeightLayer { protected: virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType); virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); virtual void _constrainWeights(); public: FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); FCLayer(); }; class SplitFCLayer : public FCLayer { protected: int _numParts; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); // void bpropBiases(NVMatrix& v, PASS_TYPE passType); void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); void splitWeights(); public: SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); }; class SoftmaxLayer : public Layer { protected: bool _doUpperGrad; NVMatrix _max, _sum; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); void setDoUpperGrad(bool b); }; class ConcatenationLayer : public Layer { protected: intv* _copyOffsets; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); virtual ~ConcatenationLayer(); }; class PassThroughLayer : public Layer { protected: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); virtual bool postInit(); }; class EltwiseSumLayer : public Layer { protected: floatv* _coeffs; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); ~EltwiseSumLayer(); }; class EltwiseMaxLayer : public Layer { protected: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class SumLayer : public Layer { protected: int _stride; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class DataCopyMessage { public: enum MESSAGE_TYPE { COPY, EXIT }; protected: CPUData* _cpuData; int 
_passIdx; bool _other; DataCopyMessage::MESSAGE_TYPE _type; DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) { } public: DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) { } CPUData& getData() const { return *_cpuData; } int getPassIdx() const { return _passIdx; } bool isOther() const { return _other; } DataCopyMessage::MESSAGE_TYPE getType() { return _type; } }; class DataCopyExitMessage : public DataCopyMessage { public: DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) { } }; class DataCopyThread; class DataLayer : public Layer { protected: bool _useBuffer; int _dataIdx; ConvNet* _convNet; // std::map _outputs2; // Buffer for copying data during computation std::map _memSrcActs2; // // Buffer for copying data during computation std::map _copyStreams; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); Queue _copyFinishQueue; DataCopyThread* _copier; bool _outstandingCopyRequest; int _start, _end; public: void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer); DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID); ~DataLayer(); NVMatrix& getActs(int deviceID); // NVMatrix& getActs(int deviceID, bool other); NVMatrix& getActs(int deviceID, bool other, int numCases); bool isGradProducer(); void toggleBuffer(int passIdx); void copyData(CPUData& data, bool other, int passIdx); bool postInit(); ConvNet& getConvNet(); int getNumInputReplicas(); cudaStream_t getCopyStream(int deviceID); Queue& getCopyFinishQueue() { return _copyFinishQueue; } void waitForCopyFinish(); int getDataIdx() const { return _dataIdx; } int getStart() const { return _start; } int getEnd() const { return _end; } }; class DataCopyThread : public Thread { protected: DataLayer* _parent; Queue _queue; HostNVMatrix _hostMemFwd; Timer _requestTimer; int _sleepUsec; virtual void* run(); public: DataCopyThread(DataLayer& parent, intv& cpus); Queue& getQueue(); void stop(); }; class LocalLayer : public WeightLayer { protected: intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups; intv* _imgPixels, *_filterPixels, *_filterChannels; int _modulesX, _modules, _numFilters; public: LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad); virtual ~LocalLayer(); }; class ConvLayer : public LocalLayer { protected: int _sumWidth; bool _sharedBiases; floatv* _weightContrastNormMin, *_weightContrastNormMax; NVMatrix _weightGradTmp; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void bpropBiases(NVMatrix& v, PASS_TYPE passType); void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); void truncBwdActs(); void _constrainWeights(); public: ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); virtual ~ConvLayer(); }; class LocalUnsharedLayer : public LocalLayer { protected: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void bpropBiases(NVMatrix& v, PASS_TYPE passType); void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType); void _constrainWeights(); public: LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class 
PoolLayer : public Layer, public TwoDLayerInterface { protected: int _sizeX, _start, _stride, _outputsX; std::string _pool; public: PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class AvgPoolLayer : public PoolLayer { protected: bool _sum; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class MaxPoolLayer : public PoolLayer { protected: bool _abs; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs); }; class CrossMapPoolLayer : public Layer, public TwoDLayerInterface { protected: int _size, _start, _stride, _outputs; std::string _pool; public: CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class CrossMapMaxPoolLayer : public CrossMapPoolLayer { protected: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class RandomScaleLayer : public Layer, public TwoDLayerInterface { protected: int _tgtSize, _minScaledSize; float _maxScale; // should be >= 1 NVMatrix _rescaledActs; std::vector _scaleProbs; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class CropLayer : public Layer, public TwoDLayerInterface { protected: int _tgtSize, _startX, _startY; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class NailbedLayer : public Layer, public TwoDLayerInterface { protected: int _start, _stride, _outputsX; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class GaussianBlurLayer : public Layer, public TwoDLayerInterface { protected: Matrix* _hFilter; NVMatrix _filter; NVMatrix _actGradsTmp; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void copyToGPU(); GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); ~GaussianBlurLayer(); }; class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface { protected: public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int 
replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID); }; class ResizeLayer : public Layer, public TwoDLayerInterface { protected: float _scale; int _tgtSize; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class DropoutLayer : public Layer { protected: bool _enable; float _keep; NVMatrix _keepMask; public: virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void truncBwdActs(); DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); class DropoutSmallerThanOperator { private: float _keep, _scale; public: DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) { } __device__ inline float operator()(const float x) const { return (x < _keep) * _scale; } }; }; class Dropout2Layer : public DropoutLayer { protected: public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class RGBToYUVLayer : public Layer { public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class RGBToLABLayer : public Layer { protected: bool _center; public: void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class ResponseNormLayer : public Layer, public TwoDLayerInterface { protected: int _size; float _scale, _pow; float _minDiv; NVMatrix _denoms; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void truncBwdActs(); public: ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class CrossMapResponseNormLayer : public ResponseNormLayer { protected: bool _blocked; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class ContrastNormLayer : public ResponseNormLayer { protected: NVMatrix _meanDiffs; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); void truncBwdActs(); public: ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class CostLayer : public Layer { protected: float _coeff; doublev _costv; NVMatrix _tmpbuf; // For error accumulation int _numCases; // number of cases that the values in _costv were computed on bool _aggregated; void 
fpropCommon(PASS_TYPE passType); public: CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans); void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx); bool fprop(PASS_TYPE passType, int passIdx); int getNumCases(); virtual doublev& getCost(); float getCoeff(); bool isGradProducer(); void setSendTerminalMessages(bool send); void resetPassIdx(); static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID); }; /* * Input 0: labels * Input 1: softmax outputs */ class CrossEntCostLayer : public CostLayer { protected: NVMatrix _trueLabelLogProbs, _correctProbs; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; /* * Input 0: labels * Input 1: softmax outputs */ class LogregCostLayer : public CostLayer { protected: NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs; std::map _probsAccum; // input replica idx -> nvmatrix NVMatrix _maxProbs; std::map _numAccumed; // input replica idx -> int int _topk; bool _doCompute; virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); NVMatrix& getProbsAccum(int replicaIdx); }; /* * Input 0: labels * Input 1: logistic outputs */ class BinomialCrossEntropyCostLayer : public CostLayer { protected: bool _computeSoftmaxErrorRate; NVMatrix _tmpProbs, _tmpVec, _correctProbs; float _posWeight; virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); float getPosWeight(); // Only for use with non-logistic units class BinomialCrossEntGradientOperator { private: float _coeff, _posWeight; public: BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) { } __device__ inline float operator()(const float t, const float y) const { return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y)); } }; }; /* * Input 0: labels * Input 1: logistic outputs */ class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer { protected: Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive; NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); public: DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; class SumOfSquaresCostLayer : public CostLayer { protected: NVMatrix _tmp; void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx); void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType); public: SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID); }; #endif /* LAYER_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer_kernels.cuh ================================================ /* * Copyright 2014 Google Inc. 
All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LAYER_KERNELS_CUH #define LAYER_KERNELS_CUH #include #include #include "../../nvmatrix/include/nvmatrix.cuh" #define LOGREG_GRAD_THREADS_X 32 #define LOGREG_GRAD_THREADS_Y 4 #define LOGREG_ERR_THREADS_X 128 #define LOGREG_ERR_THREADS_Y 1 __device__ inline float safelog(const float x) { return x > 0.0f ? __logf(x) : -50.0f; } // The input matrix here is the squared norm. // This replaces the squared norm with: // 1 if it is below the threshold given by norm2 // norm/sqrt(a) otherwise -- i.e. the desired norm (not squared) class MaxWeightConstraintOperator { private: float _norm, _norm2; public: MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { } __device__ inline float operator()(const float a) const { return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f; } }; class HardWeightConstraintOperator { private: float _norm, _norm2; public: HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) { } __device__ inline float operator()(const float a) const { return __fdividef(_norm, sqrtf(a)); } }; class WeightContrastNormOperator { private: float _min, _max, _scale; public: WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) { } __device__ inline float operator()(float a) const { a = sqrtf(a) * _scale; return a < _min ? __fdividef(_min, a) : a > _max ? __fdividef(_max, a) : 1.0f; } }; void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out); void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff); void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad); void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out); void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff); // Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad // to avoid dividing and then multiplying by quantities that may be near zero. void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff); void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff); void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add); void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize); #endif /* LAYER_KERNELS_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LR_CUH #define LR_CUH #include #include #include #include #include #include #include "util.cuh" #include "../../nvmatrix/include/nvmatrix.cuh" #include "../../util/include/matrix.h" /* * The maximum learning rate is _baseRate. * The minimum learning rate is _baseRate / _tgtFactor. * * These classes define annealing schedules that interpolate between these * two extrema. */ class ParameterSchedule { protected: double _baseRate; public: ParameterSchedule(double base); virtual double getValue(double progress); double getBaseValue() const; virtual ~ParameterSchedule(); static ParameterSchedule& make(PyObject* schedDict); }; class LinearParameterSchedule : public ParameterSchedule { protected: double _finalRate; public: LinearParameterSchedule(double base, double tgtFactor); virtual double getValue(double progress); }; class ExpParameterSchedule : public ParameterSchedule { protected: double _powBase; public: ExpParameterSchedule(double baseRate, double tgtFactor); virtual double getValue(double progress); }; class DiscreteExpParameterSchedule : public ParameterSchedule { protected: std::vector _rates; public: DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps); virtual double getValue(double progress); }; #endif /* LR_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/memorysource.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "../../nvmatrix/include/nvmatrix.cuh" class MemorySource; class MemoryView { protected: MemorySource* _src; std::string _name; public: MemoryView(MemorySource& src, std::string& name); ~MemoryView(); NVMatrix& getMemory(int numCases); NVMatrix& getMemory(); MemorySource& getMemorySource(); bool isParent(); std::string& getName(); MemoryView& clone(std::string& name); }; // Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU. 
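// Illustrative usage sketch (not part of the original source): obtaining named
// views over a single backing MemorySource. The view names and sizes here are
// hypothetical; only the interface declared in this file is assumed.
//
//   std::string a("acts_a"), b("acts_b");
//   MemoryView& viewA = MemorySource::make(/*size=*/4096, /*deviceID=*/0, a);
//   MemoryView& viewB = viewA.getMemorySource().addUser(b);
//   NVMatrix& acts = viewA.getMemory(/*numCases=*/128); // slice sized for this minibatch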
class MemorySource { protected: // int _inputIdx; NVMatrix _memory; int _deviceID; int _size; std::map<std::string, std::pair<int, int> > _viewRanges; std::map<std::string, MemoryView*> _memoryViews; // input idx --> slice of _memory std::set<std::string> _truncateRequests; Lock _lock; public: MemorySource(int size, int deviceID); ~MemorySource(); NVMatrix& getMemory(std::string& name, int numCases); NVMatrix& getMemory(std::string& name); MemoryView& addUser(std::string& name, std::pair<int, int> range); MemoryView& addUser(std::string& name); std::pair<int, int> getRange(std::string& name); int getSize(); bool truncate(std::string& name); static MemoryView& make(int size, int deviceID, std::string& parentUser); }; ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef MESSAGES_CUH_ #define MESSAGES_CUH_ #include #include "layer.cuh" class Layer; enum MESSAGES { FPROP_TERMINAL, BPROP_TERMINAL, BPROP_READY, FPROP_READY, SYNC, COPY_TO_CPU, COPY_TO_GPU, UPDATE_WEIGHTS, CONSTRAIN_WEIGHTS, RESET, RESET_PASS_IDX, COST_COMPUTED, BPROP_START, EXIT_CONVNET}; class Message { protected: MESSAGES _messageType; public: MESSAGES getType() { return _messageType; } virtual Message* clone() { return new Message(_messageType); } Message(MESSAGES messageType) : _messageType(messageType) { } virtual ~Message() { } }; class PropMessage : public Message { protected: Layer *_toLayer; PASS_TYPE _passType; int _passIdx; public: Layer& getToLayer() { return *_toLayer; } PASS_TYPE getPassType() { return _passType; } int getPassIdx() { return _passIdx; } virtual PropMessage* clone() { return new PropMessage(*_toLayer, _passType, _passIdx, _messageType); } PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType) : _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) { } }; class FpropMessage : public PropMessage { public: FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) : PropMessage(toLayer, passType, passIdx, FPROP_READY) { } virtual FpropMessage* clone() { return new FpropMessage(*_toLayer, _passType, _passIdx); } }; class BpropMessage : public PropMessage { public: BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx) : PropMessage(toLayer, passType, passIdx, BPROP_READY) { } virtual BpropMessage* clone() { return new BpropMessage(*_toLayer, _passType, _passIdx); } }; class BpropStartMessage : public Message { protected: PASS_TYPE _passType; int _passIdx; public: PASS_TYPE getPassType() { return _passType; } int getPassIdx() { return _passIdx; } virtual BpropStartMessage* clone() { return new BpropStartMessage(_passType, _passIdx); } BpropStartMessage(PASS_TYPE passType, int passIdx) : _passType(passType), Message(BPROP_START), _passIdx(passIdx) { } }; #endif /* MESSAGES_CUH_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh
================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef NEURONS_CUH #define NEURONS_CUH #include #include #include #include "../../nvmatrix/include/nvmatrix.cuh" #include template class AddGradientBinaryOperator { GradientOp _op; public: AddGradientBinaryOperator(GradientOp op) : _op(op) { } __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const { return _op(unitActGrad, unitAct) + target; } }; template class AddGradientOperator { GradientOp _op; public: AddGradientOperator(GradientOp op) : _op(op) { } __device__ inline float operator()(const float unitActGrad, const float target) const { return target + _op(unitActGrad); } }; /* ======================= * Neuron * ----------------------- * * f(x) = x * ======================= */ class Neuron { protected: bool _activated; // Inputs and outputs potentially point to the same matrix, depending on the neuron NVMatrix* _inputs, *_outputs; virtual void _activate() { if (_inputs != _outputs) { _inputs->copy(*_outputs); } } virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { if (&target != &actsGrad) { actsGrad.copy(target); } } virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { if (&target != &actsGrad) { target.add(actsGrad); } } public: Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) { } virtual void activate(NVMatrix& inputs, NVMatrix& outputs) { _activated = true; _inputs = &inputs; _outputs = &outputs; _activate(); } virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) { assert(_activated); if (!add) { target.resize(actsGrad); _computeInputGrad(actsGrad, target); } else { _addInputGrad(actsGrad, target); } } static Neuron& makeNeuron(PyObject* neuronDict); }; /* ======================= * LogisticNeuron * ----------------------- * * f(x) = 1 / (1 + e^-x) * ======================= */ class LogisticNeuron : public Neuron { protected: void _activate() { _inputs->apply(NVMatrixOps::Logistic(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(LogisticGradientOperator()), *_outputs, target, target); } public: class LogisticGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitAct) const { return unitActGrad * unitAct * (1.0f - unitAct); } }; LogisticNeuron() : Neuron() { } }; /* ======================= * LogNeuron * ----------------------- * * f(x) = log(eps + x) * ======================= */ class LogNeuron : public Neuron { protected: float _eps; void _activate() { _inputs->apply(LogOperator(_eps), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { 
actsGrad.applyTernary(AddGradientBinaryOperator(LogGradientOperator(_eps)), *_inputs, target, target); } public: class LogGradientOperator { protected: float _eps; public: __device__ inline float operator()(float unitActGrad, float unitInput) const { return __fdividef(unitActGrad, _eps + unitInput); } LogGradientOperator(float eps) : _eps(eps) { } }; class LogOperator { protected: float _eps; public: __device__ inline float operator()(float x) const { return __logf(_eps + x); } LogOperator(float eps) : _eps(eps) { } }; LogNeuron(float eps) : _eps(eps), Neuron() { } }; /* ======================= * ReluNeuron * ----------------------- * * f(x) = max(0, x) * ======================= */ class ReluNeuron : public Neuron { protected: virtual void _activate() { _inputs->apply(ReluOperator(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(ReluGradientOperator()), *_outputs, target, target); } public: class ReluOperator { public: __device__ inline float operator()(float x) const { return x < 0.0f ? 0.0f : x; } }; class ReluGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitAct) const { return unitActGrad * (unitAct > 0.0f); } }; ReluNeuron() : Neuron() { } }; /* ======================= * BoundedReluNeuron * ----------------------- * * f(x) = min(a, max(0, x)) * ======================= */ class BoundedReluNeuron : public Neuron { protected: float _a; void _activate() { _inputs->apply(BoundedReluOperator(_a), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(BoundedReluGradientOperator(_a)), *_outputs, target, target); } public: class BoundedReluOperator { private: float _a; public: BoundedReluOperator(float a) : _a(a) { } __device__ inline float operator()(float x) const { return x < 0.0f ? 0.0f : x > _a ? _a : x; } }; class BoundedReluGradientOperator { private: float _a; public: BoundedReluGradientOperator(float a) : _a(a) { } __device__ inline float operator()(float unitActGrad, float unitAct) const { return unitActGrad * (unitAct > 0.0f) * (unitAct < _a); } }; BoundedReluNeuron(float a) : Neuron(), _a(a) { } }; /* ======================= * AbsNeuron * ----------------------- * * f(x) = abs(x) * ======================= */ class AbsNeuron : public Neuron { protected: void _activate() { assert(_inputs != _outputs); _inputs->apply(NVMatrixOps::Abs(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(AbsGradientOperator()), *_inputs, target, target); } public: class AbsGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitInput) const { return unitActGrad * (unitInput > 0.0f ? 
1.0f : -1.0f); } }; AbsNeuron() : Neuron() { } }; /* ======================= * TanhNeuron * ----------------------- * * f(x) = a*tanh(b*x) * ======================= */ class TanhNeuron : public Neuron { protected: float _a, _b; void _activate() { _inputs->apply(TanhOperator(_a, _b), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(TanhGradientOperator(_a, _b)), *_outputs, target, target); } public: class TanhOperator { private: float _a, _n2b; public: TanhOperator(float a, float b) : _a(a), _n2b(-2*b) { } virtual __device__ inline float operator()(float x) const { return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f); } }; class TanhGradientOperator { private: float _b, _a; public: TanhGradientOperator(float a, float b) : _b(b), _a(a) { } __device__ inline float operator()(float unitActGrad, float unitAct) const { // const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f; // return unitActGrad * _n4ab * (t * (t - 1.0f)); return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a)); } }; TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) { } }; /* ======================= * DoubleReluNeuron * ----------------------- * * f(x) = x - a*tanh(x/a) * ======================= */ class DoubleReluNeuron : public Neuron { protected: float _a; void _activate() { assert(_inputs != _outputs); _inputs->apply(DoubleReluOperator(_a), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(DoubleReluGradientOperator(_a)), *_inputs, target, target); } public: class DoubleReluOperator { private: float _a, _n2a; public: DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) { } virtual __device__ inline float operator()(float x) const { return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f); } }; class DoubleReluGradientOperator { private: float _n2a; public: DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) { } __device__ inline float operator()(float unitActGrad, float unitInput) const { const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f; return unitActGrad * (tanh*tanh); } }; DoubleReluNeuron(float a) : Neuron(), _a(a) { } }; /* ======================= * SoftReluNeuron * ----------------------- * * f(x) = log(1 + e^x) * ======================= */ class SoftReluNeuron : public Neuron { protected: void _activate() { // assert(_inputs != _outputs); _inputs->apply(SoftReluOperator(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(SoftReluGradientOperator()), *_outputs, target, target); } public: class SoftReluOperator { public: __device__ inline float operator()(float x) const { // This piece-wise implementation has better numerical stability than // simply computing log(1 + e^x). return x > 4.0f ? 
x : __logf(1.0f + __expf(x)); } }; class SoftReluGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitOutput) const { if (unitOutput > 4.0f) { return unitActGrad; } const float f = __expf(-unitOutput); return unitActGrad * (1.0f - f); } }; SoftReluNeuron() : Neuron() { } }; /* ======================= * SquareNeuron * ----------------------- * * f(x) = x^2 * ======================= */ class SquareNeuron : public Neuron { protected: void _activate() { assert(_inputs != _outputs); _inputs->apply(NVMatrixOps::Square(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(SquareGradientOperator()), *_inputs, target, target); } public: class SquareGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitInput) const { return unitActGrad * 2.0f * unitInput; } }; SquareNeuron() : Neuron() { } }; /* ======================= * SqrtNeuron * ----------------------- * * f(x) = sqrt(x) * ======================= */ class SqrtNeuron : public Neuron { protected: void _activate() { _inputs->apply(NVMatrixOps::Sqrt(), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyTernary(AddGradientBinaryOperator(SqrtGradientOperator()), *_outputs, target, target); } public: class SqrtGradientOperator { public: __device__ inline float operator()(float unitActGrad, float unitAct) const { return __fdividef(unitActGrad, 2.0f * unitAct); } }; SqrtNeuron() : Neuron() { } }; /* ======================= * LinearNeuron * ----------------------- * * f(x) = a*x + b * ======================= */ class LinearNeuron : public Neuron { protected: float _a, _b; void _activate() { _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs); } void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.scale(_a, target); } void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) { actsGrad.applyBinary(AddGradientOperator(NVMatrixOps::MultByScalar(_a)), target, target); } public: LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) { } }; #endif /* NEURONS_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/pipedispenser.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef PIPEDISPENSER_CUH_ #define PIPEDISPENSER_CUH_ #include #include #include #include #include "../../util/include/thread.h" #include "util.cuh" /* * PipeDispenser interface */ class PipeDispenser { protected: int _numPipes; seti _pipes; pthread_mutex_t *_mutex; void lock() { pthread_mutex_lock(_mutex); } void unlock() { pthread_mutex_unlock(_mutex); } virtual void init() { _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); pthread_mutex_init(_mutex, NULL); } public: PipeDispenser(const seti& pipes) { _pipes.insert(pipes.begin(), pipes.end()); init(); } PipeDispenser(int numPipes) { for (int i = 0; i < numPipes; ++i) { _pipes.insert(i); } init(); } virtual ~PipeDispenser() { pthread_mutex_destroy(_mutex); free(_mutex); } virtual int getPipe(const seti& interested) = 0; int getPipe(int interested) { seti tmp; tmp.insert(interested); return getPipe(tmp); } virtual void freePipe(int pipe) = 0; }; /* * This one blocks until there is a free pipe to return. */ class PipeDispenserBlocking : public PipeDispenser { protected: pthread_cond_t *_cv; void wait() { pthread_cond_wait(_cv, _mutex); } void broadcast() { pthread_cond_broadcast(_cv); } int getAvailablePipes(const seti& interested, intv& available) { available.clear(); std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available)); return available.size(); } virtual void init() { PipeDispenser::init(); _cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t))); pthread_cond_init(_cv, NULL); } public: PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) { init(); } PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) { init(); } ~PipeDispenserBlocking() { pthread_cond_destroy(_cv); free(_cv); } int getPipe(const seti& interested) { lock(); intv avail; while (getAvailablePipes(interested, avail) == 0) { wait(); } int pipe = avail[0]; _pipes.erase(pipe); unlock(); return pipe; } void freePipe(int pipe) { lock(); _pipes.insert(pipe); broadcast(); unlock(); } }; /* * This one returns the least-occupied pipe. */ class PipeDispenserNonBlocking : public PipeDispenser { protected: std::map _pipeUsers; public: PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) { for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) { _pipeUsers[*it] = 0; } } int getPipe(const seti& interested) { lock(); int pipe = -1, users = 1 << 30; for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) { if (interested.count(*it) > 0 && _pipeUsers[*it] < users) { pipe = *it; users = _pipeUsers[*it]; } } if (pipe >= 0) { _pipeUsers[pipe]++; } unlock(); return pipe; } void freePipe(int pipe) { lock(); _pipeUsers[pipe]--; unlock(); } }; #endif /* PIPEDISPENSER_CUH_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/pyconvnet.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef PYCONVNET3_CUH #define PYCONVNET3_CUH #define _QUOTEME(x) #x #define QUOTEME(x) _QUOTEME(x) extern "C" void init_ConvNet(); PyObject* initModel(PyObject *self, PyObject *args); PyObject* startBatch(PyObject *self, PyObject *args); PyObject* finishBatch(PyObject *self, PyObject *args); PyObject* checkGradients(PyObject *self, PyObject *args); PyObject* syncWithHost(PyObject *self, PyObject *args); PyObject* startMultiviewTest(PyObject *self, PyObject *args); PyObject* startFeatureWriter(PyObject *self, PyObject *args); PyObject* startDataGrad(PyObject *self, PyObject *args); PyObject* decodeJpeg(PyObject *self, PyObject *args); #endif ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/reducepipeline.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef REDUCEPIPELINE_CUH_H_ #define REDUCEPIPELINE_CUH_H_ #include "../../util/include/thread.h" #include "../../util/include/queue.h" #include #include "../../nvmatrix/include/nvmatrix.cuh" #include "util.cuh" #define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k #define REDUCE_MAX_CHUNKS 16 #define REDUCE_MIN_CHUNKS 2 enum REDUCE_MESSAGE_TYPE { REDUCE_CHUNK, REDUCE_START, EXIT }; class ReducePeer; class ReducerSource; class IReduceSegment; class IEightGPUReducer; class ReduceMessage { protected: REDUCE_MESSAGE_TYPE _msgType; float _scaleIntermediates, _scaleTarget; std::map* _mats; public: ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map& mats) : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) { } ReduceMessage(REDUCE_MESSAGE_TYPE msgType) : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) { } inline REDUCE_MESSAGE_TYPE getType() const { return _msgType; } inline float getScaleIntermediates() const { return _scaleIntermediates; } inline float getScaleTarget() const { return _scaleTarget; } inline NVMatrix& getMatrix(int deviceID) const { return *_mats->at(deviceID); } inline std::map& getMatrices() const { return *_mats; } }; class ReduceChunkMessage : public ReduceMessage { protected: int _chunkIdx; int _chunkSize; int _numChunks; IReduceSegment* _src; public: ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map& mats) : _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) { } inline int getChunkIdx() const { return _chunkIdx; } inline int getChunkSize() const { return _chunkSize; } inline int getNumChunks() const { return _numChunks; } inline IReduceSegment& getSource() const { return *_src; } }; class ReduceStartMessage : public ReduceMessage { public: ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map& mats) : ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) { 
} }; class IReduceSegment : public Thread { protected: int _deviceID; std::vector<IReduceSegment*> _prev; ReducePeer* _next; Queue<ReduceMessage*> _queue; Queue<int>* _finishQueue; NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx); void* run(); virtual bool processMessage(ReduceMessage& msg) = 0; public: IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue); virtual ~IReduceSegment(); inline virtual NVMatrix& getMatrix(ReduceMessage& msg); Queue<ReduceMessage*>& getQueue(); int getDeviceID() const; void addPrev(IReduceSegment& c); void addNext(ReducePeer& c); bool isTerminal() const; }; class ReducerSource : public IReduceSegment { protected: bool processMessage(ReduceMessage& msg); public: ReducerSource(IEightGPUReducer& parent, int deviceID); }; class ReducePeer : public IReduceSegment { protected: std::map<int, cudaStream_t> _streams; // device id -> stream std::map<int, int> _numInputsReceived; // chunk idx -> num inputs int _numInputsFinished; HostNVMatrix _mat; bool _add; bool processMessage(ReduceMessage& msg); inline cudaStream_t getStream(int deviceID); inline NVMatrix& getMatrix(ReduceMessage& msg); void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt); public: ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue); ReducePeer(IEightGPUReducer& parent); ~ReducePeer(); }; class IEightGPUReducer { protected: std::vector<ReducerSource*> _sources; std::vector<ReducePeer*> _peers; Queue<int> _finishQueue; int _tgtDeviceID; virtual void makeConnections(std::vector<int>& same, std::vector<int>& other) = 0; public: IEightGPUReducer(int tgtDeviceID); virtual ~IEightGPUReducer(); IEightGPUReducer& construct(); void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget); void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates); void reduce(std::map<int, NVMatrix*>& mats); int getTgtDeviceID() const; }; class EightGPUReducer1 : public IEightGPUReducer { protected: void makeConnections(std::vector<int>& same, std::vector<int>& other); public: EightGPUReducer1(int tgtDeviceID); }; class EightGPUReducer2 : public IEightGPUReducer { protected: void makeConnections(std::vector<int>& same, std::vector<int>& other); public: EightGPUReducer2(int tgtDeviceID); }; #endif /* REDUCEPIPELINE_CUH_H_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/streambroadcast.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ #ifndef STREAMBROADCAST_CUH_ #define STREAMBROADCAST_CUH_ #include #include "../../util/include/queue.h" #include "../../nvmatrix/include/nvmatrix.cuh" #include "util.cuh" class Layer; //#define NUM_STREAM_COPY_PARTS 4 // This is in 4-byte words, not bytes #define SB_MIN_CHUNK_SIZE (1<<17) #define SB_MAX_CHUNKS 16 class StreamBroadcast { protected: std::map _streams; std::set _ownedStreams; HostNVMatrix _hostMem; void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice); void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput); void init(std::map& streams); void init(std::map& mats); public: StreamBroadcast(std::map& streams); StreamBroadcast(); virtual ~StreamBroadcast(); void transfer(std::map& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput); void transfer(std::map& mats, int srcDevice, float scaleTarget, float scaleOutput); void transfer(std::map& mats, int srcDevice); void sync(int deviceID); cudaStream_t getStream(int deviceID); }; #endif /* STREAMBROADCAST_CUH_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TIMER_CC_H_ #define TIMER_CC_H_ #include class Timer { protected: StopWatchInterface* _timer; bool _started; public: Timer() : _started(false) { sdkCreateTimer(&_timer); } ~Timer() { sdkDeleteTimer(&_timer); } inline void start () { _started = true; sdkResetTimer(&_timer); sdkStartTimer(&_timer); } inline double stop() { sdkStopTimer(&_timer); _started = false; return sdkGetTimerValue(&_timer); } inline bool isStarted() const { return _started; } }; #endif /* TIMER_CC_H_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef UTIL_H #define UTIL_H #include #include #include #include #include #include #include #include #include #include #include "../../nvmatrix/include/nvmatrix.cuh" #include "../../util/include/matrix.h" #define PASS_TYPE uint #define PASS_TRAIN 0x1 #define PASS_TEST 0x2 #define PASS_GC 0x4 #define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8) #define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10) #define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20) #define PASS_FEATURE_GEN 0x40 #define HAS_FLAG(f, x) (((x) & (f)) == (f)) #define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x) #define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x) #define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x) #define IS_TEST(x) HAS_FLAG(PASS_TEST, x) #define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x) // For gradient checking #define GC_SUPPRESS_PASSES false #define GC_REL_ERR_THRESH 0.02 #ifdef DO_PRINT #define PRINT(x, args...) printf(x, ## args); #else #define PRINT(x, args...) ; #endif /* * Generates a random floating point number in the range 0-1. */ #define randf ((float)rand() / RAND_MAX) //typedef std::vector<Matrix*> MatrixV; //typedef std::vector<NVMatrix*> NVMatrixV; typedef std::map<std::string, std::vector<double>*> CostMap; typedef std::map<std::string, double> CostCoeffMap; typedef std::vector<double> doublev; typedef std::vector<float> floatv; typedef std::vector<int> intv; typedef std::vector<std::string> stringv; typedef std::set<int> seti; typedef std::vector<PyObject*> PyObjectV; stringv* getStringV(PyObject* pyList); floatv* getFloatV(PyObject* pyList); intv* getIntV(PyObject* pyList); MatrixV* getMatrixV(PyObject* pyList); MatrixV* getMatrixV(PyObject* pyList, int len); int* getIntA(PyObject* pyList); int pyDictGetInt(PyObject* dict, const char* key); intv* pyDictGetIntV(PyObject* dict, const char* key); std::string pyDictGetString(PyObject* dict, const char* key); float pyDictGetFloat(PyObject* dict, const char* key); floatv* pyDictGetFloatV(PyObject* dict, const char* key); Matrix* pyDictGetMatrix(PyObject* dict, const char* key); MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key); int* pyDictGetIntA(PyObject* dict, const char* key); stringv* pyDictGetStringV(PyObject* dict, const char* key); bool pyDictHasKey(PyObject* dict, const char* key); PyObjectV* pyDictGetValues(PyObject* dict); template<typename T> std::string tostr(T n); template<typename T> void shuffleVector(std::vector<T>& v, int start, int end); template<typename T> void deleteElements(std::vector<T>& v); template<typename T> void deleteElements(std::vector<T>& v, bool deleteContainer); template<typename T> int indexOf(std::vector<T>& v, T e) { int i = 0; // typename vector<T>::iterator it2 = v.begin(); for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) { if (*it == e) { return i; } ++i; } return -1; } std::vector<int>& getDeviceCPUs(int deviceID); template<typename K, typename V> std::set<K> getKeys(std::map<K, V>& m) { std::set<K> s; for (typename std::map<K, V>::const_iterator it = m.begin(); it != m.end(); ++it) { s.insert(it->first); } return s; } struct LayerIDComparator { bool operator()(PyObject* i, PyObject* j) { return pyDictGetInt(i, "id") < pyDictGetInt(j, "id"); } }; #endif /* UTIL_H */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef WEIGHTS_CUH #define WEIGHTS_CUH #include #include #include #include #include #include "../../nvmatrix/include/nvmatrix.cuh" #include "../../util/include/matrix.h" #include "util.cuh" #include "lr.cuh" #include "layer.cuh" #include "copypipeline.cuh" #include "reducepipeline.cuh" #include "streambroadcast.cuh" class Layer; class Weights; class StreamBroadcast; class IWeightReducer { protected: int _tgtReplicaID; std::map<int, Weights*> _replicas; int getDeviceID(); public: IWeightReducer(std::map<int, Weights*>& replicas, int srcReplicaID); virtual ~IWeightReducer(); static IWeightReducer& make(std::map<int, Weights*>& replicas, int srcReplicaID); virtual void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) = 0; }; class SequentialWeightReducer : public IWeightReducer { protected: StreamBroadcast* _sb; public: SequentialWeightReducer(std::map<int, Weights*>& replicas, int srcReplicaID); ~SequentialWeightReducer(); void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc); }; class ParallelWeightReducer : public IWeightReducer { protected: IEightGPUReducer* _reducer; public: ParallelWeightReducer(std::map<int, Weights*>& replicas, int srcReplicaID); ~ParallelWeightReducer(); void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc); }; class Weights { protected: Matrix* _hWeights, *_hWeightsInc; NVMatrix* _weights, *_weightsInc, *_weightsGrad; ParameterSchedule* _lrs; float _wc, _mom, _wball; bool _onGPU, _useGrad, _cleanup; int _numUpdates; // Note: every layer is its own sibling too std::map<int, Weights*> _replicas; // Non-NULL if these weights are really shared from some other layer Weights* _srcWeights; Layer* _parent; int _shardSize; IWeightReducer* _reducer; ISafeBroadcastNetwork* _broadcaster; void aggregateReplicaGradients(float progress); // TODO: assert that these return contiguous views template<class T> T& getShard(T& mat, int replicaID); template<class T> T& getShard(T& mat); void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup); public: NVMatrix& operator*() const; Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent); Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad); virtual ~Weights(); virtual NVMatrix& getW() const; virtual NVMatrix& getInc() const; virtual NVMatrix& getGrad() const; virtual Matrix& getCPUW() const; virtual Matrix& getCPUWInc() const; virtual ParameterSchedule& getLearningRateSchedule() const; virtual int getNumRows() const; virtual int getNumCols() const; virtual void copyToCPU(); // This function is assumed to be called in the order in which the layers // were defined virtual void copyToGPU(); virtual void update(float progress); virtual void addReplica(Weights& sibling); int incNumUpdates(); // Returns the number of times a gradient has been computed for this // weight matrix during the current pass (interval between two calls of update()) // through the net. This number will only be greater than 1 if this weight matrix // is *shared* by multiple layers in the net.
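// Example (illustrative): if this weight matrix is shared by two layers, each
// layer's bprop calls incNumUpdates() once, so getNumUpdates() returns 2 during
// that pass; for an unshared matrix it returns 1.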
int getNumUpdates() const; float getEps(float progress) const; float getMom() const; float getWC() const; float getWBall() const; bool isUseGrad() const; bool isOwner() const; int getReplicaID(); int getDeviceID(); Layer& getParent(); std::map& getReplicas(); ISafeBroadcastNetwork& getBroadcaster(); IWeightReducer& getReducer(); }; class WeightList { private: std::vector _weightList; public: Weights& operator[](const int idx) const; ~WeightList(); WeightList(); Weights& at(const int i) const; void addWeights(Weights& w); void addReplica(WeightList& sibling); void update(float progress); void copyToCPU(); void copyToGPU(); int getSize() const; }; #endif /* WEIGHTS_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef WORKER_CUH #define WORKER_CUH #include "convnet.cuh" #include "cost.cuh" #include "data.cuh" class ConvNet; class Cost; class WorkResult { public: enum RESULTS {BATCH_DONE, SYNC_DONE}; protected: WorkResult::RESULTS _resultType; Cost* _results; public: WorkResult(WorkResult::RESULTS resultType, Cost& results); WorkResult(WorkResult::RESULTS resultType); virtual ~WorkResult(); Cost& getResults() const; WorkResult::RESULTS getResultType() const; }; class Worker { protected: ConvNet* _convNet; public: Worker(ConvNet& convNet); virtual ~Worker(); virtual bool run() = 0; }; class DataWorker : public Worker { protected: CPUData* _data; DataProvider* _dp; public: DataWorker(ConvNet& convNet, CPUData& data); virtual ~DataWorker(); bool run(); virtual void _run() = 0; }; class TrainingWorker : public DataWorker { protected: bool _test; double _progress; public: TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test); void _run(); }; class SyncWorker : public Worker { public: SyncWorker(ConvNet& convNet); bool run(); }; class ExitWorker : public Worker { public: ExitWorker(ConvNet& convNet); bool run(); }; class GradCheckWorker : public DataWorker { public: GradCheckWorker(ConvNet& convNet, CPUData& data); void _run(); }; class MultiviewTestWorker : public DataWorker { protected: int _numViews; Matrix* _cpuProbs; std::string _logregName; CPUData& getMinibatch(int v, int i); public: MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName); MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews); ~MultiviewTestWorker(); void _run(); }; class FeatureWorker : public DataWorker { protected: MatrixV *_ftrs; stringv *_layerNames; bool _deleteFeatures; public: FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true); ~FeatureWorker(); void _run(); }; class DataGradWorker : public DataWorker { protected: Matrix* _dataGrads; int _dataLayerIdx, _softmaxLayerIdx; public: DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, 
int softmaxLayerIdx); ~DataGradWorker(); void _run(); }; #endif/* WORKER_CUH */ ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/actbroadcaster.cuh" using namespace std; /* * ===================== * BroadcastMessage * ===================== */ BroadcastMessage::BroadcastMessage(map mats, int srcDevice, int userIdx, Queue& finishQueue) : _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) { } BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type) : _type(type), _finishQueue(NULL) { } int BroadcastMessage::getSrcDevice() { return _srcDevice; } map& BroadcastMessage::getMatrices() { return _mats; } int BroadcastMessage::getUserIdx() { return _userIdx; } Queue& BroadcastMessage::getFinishQueue() { return *_finishQueue; } BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() { return _type; } /* * ===================== * ExitBroadcastMessage * ===================== */ ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) { } /* * ===================== * ActBroadcaster * ===================== */ ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) { } ActBroadcaster::~ActBroadcaster() { for (map::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) { delete it->second; } } Queue& ActBroadcaster::getMessageQueue() { return _messageQueue; } void* ActBroadcaster::run() { int nextUserIdx = 0; bool exit = false; while (!exit) { BroadcastMessage& msg = *_messageQueue.dequeue(); if (msg.getMessageType() == BroadcastMessage::EXIT) { exit = true; delete &msg; } else { if (msg.getUserIdx() == nextUserIdx) { if (_broadcasters.count(msg.getSrcDevice()) == 0) { _broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice()); } _broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices()); msg.getFinishQueue().enqueue(0); delete &msg; nextUserIdx = (nextUserIdx + 1) % _numUsers; } else { _messageQueue.enqueue(&msg); } } } return NULL; } void ActBroadcaster::stop() { getMessageQueue().enqueue(new ExitBroadcastMessage()); join(); } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include "../../nvmatrix/include/nvmatrix.cuh" #include "../../nvmatrix/include/nvmatrix_operators.cuh" #include "../../util/include/matrix.h" #include "../include/convnet.cuh" #include "../include/util.cuh" using namespace std; /* * ======================= * ConvNet * ======================= */ ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, int minibatchSize, bool conserveMem) : Thread(true) { _deviceIDs = deviceIDs; _data = NULL; _bufferData = NULL; _bufferMinibatchIdx = -1; _bufferPassIdx = -1; _trainingProgress = 0; _totalPassesDone = 0; _conserveMem = conserveMem; _sync = new ThreadSynchronizer(deviceIDs.size() + 1); PyObjectV* layerList = pyDictGetValues(layerParams); std::sort(layerList->begin(), layerList->end(), LayerIDComparator()); _dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now initDataLayers(layerList); initGPUThreads(layerList); connectReplicas(); // Connect replicas to one another connectChildren(layerParams); // Connect forward/backward links in graph _numFwdTerminal = 0; // Execute post-initialization stuff for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { for (int r = 0; r < it->second.size(); r++) { _numFwdTerminal += it->second[r]->getNext().size() == 0; if (it->second[r]->getNext().size() == 0) { printf("Fwd terminal: %s\n", it->second[r]->getName().c_str()); } it->second[r]->postInit(); } } // Find and count the terminal nodes in the backward pass for (int p = 0; p < getNumPasses(); p++) { set visited; _numBwdTerminal[p] = 0; for (int t = 0; t < _convNetThreads.size(); t++) { vector& cl = _convNetThreads[t]->getCostLayers(); for (int c = 0; c < cl.size(); c++) { findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p); } } } _dp = new DataProvider(minibatchSize); // Py_DECREF(layerList); delete layerList; } ConvNet::~ConvNet() { for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { (*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET)); (*it)->join(); delete *it; } for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) { delete *it; } for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) { DEVICE_MEMORY_MANAGER::destroyInstance(*it); } HOST_MEMORY_MANAGER::destroyInstance(); delete _sync; delete _dataCopyPD; delete _dp; } void ConvNet::stop() { getWorkerQueue().enqueue(new ExitWorker(*this)); join(); } PipeDispenser& ConvNet::getDataCopyPD() { return *_dataCopyPD; } void ConvNet::initDataLayers(PyObjectV* layerList) { for (int i = 0; i < layerList->size(); i++) { PyObject* paramsDict = layerList->at(i); std::string layerType = pyDictGetString(paramsDict, "type"); if (layerType == "data") { int numReplicas = pyDictGetInt(paramsDict, "numReplicas"); for (int r = 0; r < numReplicas; ++r) { DataLayer* dataLayer = new DataLayer(this, paramsDict, r); _dataLayers.push_back(dataLayer); _layerMap[dataLayer->getName()][r] = dataLayer; } } } } void ConvNet::initGPUThreads(PyObjectV* layerList) { // Initialize GPU worker threads for (int i = 0; i < _deviceIDs.size(); ++i) { ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this); _convNetThreads.push_back(cng); for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) { const std::string& name = it->first; Layer* layer = 
it->second; _layerMap[name][layer->getReplicaID()] = layer; } } } void ConvNet::connectReplicas() { _numReplicasMax = 0; _numReplicasMin = 1 << 16; for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { _numReplicasMax = max(_numReplicasMax, int(it->second.size())); _numReplicasMin = min(_numReplicasMin, int(it->second.size())); for (map::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) { Layer& l1 = *it2->second; for (map::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) { Layer& l2 = *it3->second; l1.addReplica(l2); } } } } void ConvNet::connectChildren(PyObject* layerParams) { for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str()); PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs"); if (inputList != NULL) { // Iterate over "replicas" of this layer int numReplicas = _layerMap[it->first].size(); for (int i = 0; i < PyList_GET_SIZE(inputList); i++) { std::string inputName = PyString_AsString(PyList_GetItem(inputList, i)); int numReplicasPrev = _layerMap[inputName].size(); // How many replicas from the previous layer must this layer be connected to? int numInputReplicas = numReplicasPrev / numReplicas; for (int r = 0; r < numReplicas; r++) { for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) { it->second[r]->addPrev(*_layerMap[inputName][rp], ridx); _layerMap[inputName][rp]->addNext(*it->second[r]); } } } } } } void ConvNet::findBwdTerminal(Layer& l, set& visited, int& terminal, int passIdx) { if (visited.count(&l) == 0) { visited.insert(&l); if (l.isGradConsumer()) { bool hasPrevConsumer = false; if (l.getPrev().size() > 0) { for (int i = 0; i < l.getPrev()[0].size(); i++) { // Looking only at 0th replica is fine to see if you have // grad consumers below you. hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer(); } } if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) { terminal++; l.setBwdTerminal(passIdx); printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx); } else if (l.isGradProducer()) { for (int r = 0; r < l.getPrev().size(); r++) { for (int i = 0; i < l.getPrev()[r].size(); i++) { findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx); } } } } } } void* ConvNet::run() { for (vector::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) { (*it)->start(); } // The manager thread defaults to using the GPU of the first worker. // Put more logic here if this is inappropriate. 
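// A possible refinement (sketch, not in the original source): bind the manager
// thread to an explicitly chosen device instead of the first worker's, e.g.
//   NVMatrix::setDeviceID(_deviceIDs[managerDeviceIdx]);
// where managerDeviceIdx would be a hypothetical new configuration parameter.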
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID()); copyToGPU(); bool exit = false; while (!exit) { Worker* worker = _workerQueue.dequeue(); exit = worker->run(); delete worker; } return NULL; } Queue& ConvNet::getWorkerQueue() { return _workerQueue; } Queue& ConvNet::getResultQueue() { return _resultQueue; } DataProvider& ConvNet::getDataProvider() { return *_dp; } Layer& ConvNet::getLayer(std::string& name, int replicaID) { return *_layerMap[name][replicaID]; } void ConvNet::sendMessage(MESSAGES msg, bool sync) { sendMessage(new Message(msg), sync); } void ConvNet::sendMessage(Message* msg, bool sync) { for (int i = 0; i < _convNetThreads.size(); i++) { _convNetThreads[i]->getMessageQueue().enqueue(msg->clone()); } delete msg; if (sync) { syncWithChildren(); } } void ConvNet::copyToCPU() { sendMessage(COPY_TO_CPU, true); } void ConvNet::copyToGPU() { sendMessage(COPY_TO_GPU, false); } void ConvNet::updateWeights(int passIdx) { sendMessage(UPDATE_WEIGHTS, true); sendMessage(CONSTRAIN_WEIGHTS, true); } void ConvNet::reset(int passIdx) { sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false); } void ConvNet::reset() { reset(0); } // Fprop given data void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) { reset(passIdx); // This is necessary because setData below could delete data. If there's // an outstanding copy request, this'll cause a segfault. for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->waitForCopyFinish(); } setData(data, passIdx); for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->fprop(passType, passIdx, false); } waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); } // Fprop given minibatch idx void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) { reset(passIdx); bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx; if (!fromBuffer) { // This is necessary because setData below could delete data. If there's // an outstanding copy request, this'll cause a segfault. for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->waitForCopyFinish(); } setData(_dp->getMinibatch(miniIdx), passIdx); } else { setDataFromBuffer(); } for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->fprop(passType, passIdx, fromBuffer); } if (passIdx == getNumPasses() - 1) { // Do double-buffering from next minibatch from the DataProvider setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? 
NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0); } else { // Do double-buffering from next microbatch within current minibatch setBuffer(_data, miniIdx, passIdx + 1); } waitForTerminals(_numFwdTerminal, FPROP_TERMINAL); } void ConvNet::setDataFromBuffer() { if (_bufferData != _data) { delete _data; } _data = _bufferData; _bufferData = NULL; _bufferMinibatchIdx = -1; _bufferPassIdx = -1; } void ConvNet::setData(CPUData& data, int passIdx) { bool same = _data == _bufferData; if (&data != _data) { delete _data; } if (&data != _bufferData && !same) { delete _bufferData; _bufferData = NULL; _bufferMinibatchIdx = -1; _bufferPassIdx = -1; } _data = &data; for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->copyData(*_data, false, passIdx); } } void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) { _bufferData = bufferData; _bufferMinibatchIdx = bufferMinibatchIdx; _bufferPassIdx = bufferPassIdx; if (bufferData != NULL) { for (int i = 0; i < _dataLayers.size(); i++) { _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx); } } } CPUData& ConvNet::getData() { assert(_data != NULL); return *_data; } void ConvNet::bprop(int passIdx, PASS_TYPE passType) { _totalPassesDone++; sendMessage(new BpropStartMessage(passType, passIdx), false); waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL); reset(passIdx + 1); } void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) { for (int rcvd = 0; rcvd < numMsgs; rcvd++) { Message* m = _msgQueue.dequeue(); assert(m->getType() == msgType); delete m; } } // Same as getCost() but adds results to given cost and returns it Cost& ConvNet::getCost(Cost& cost) { Cost &tmp = getCost(); cost += tmp; delete &tmp; return cost; } Cost& ConvNet::getCost() { Cost& cost = *new Cost(); for (int t = 0; t < _convNetThreads.size(); t++) { Cost& tcost = _convNetThreads[t]->getCost(); cost += tcost; delete &tcost; } return cost; } double ConvNet::getCostValue() { Cost& cost = getCost(); double val = cost.getValue(); delete &cost; return val; } Queue& ConvNet::getMessageQueue() { return _msgQueue; } intv& ConvNet::getDeviceIDs() { return _deviceIDs; } ThreadSynchronizer& ConvNet::getSync() { return *_sync; } void ConvNet::syncWithChildren() { sendMessage(SYNC, false); _sync->sync(); } int ConvNet::getTotalPassesDone() { return _totalPassesDone; } int ConvNet::getMinibatchSize() { return _dp->getMinibatchSize(); } int ConvNet::getNumReplicasMax() { return _numReplicasMax; } int ConvNet::getNumReplicasMin() { return _numReplicasMin; } int ConvNet::getNumPasses() { return _numReplicasMax / _numReplicasMin; } void ConvNet::setTrainingProgress(double progress) { _trainingProgress = progress; } double ConvNet::getTrainingProgress() const { return _trainingProgress; } bool ConvNet::isConserveMemory() { return _conserveMem; } /* * Gradient checking stuff */ void ConvNet::checkGradients() { _numFailures = 0; _numTests = 0; _baseErr = 0; for (int p = 0; p < getNumPasses(); ++p) { fprop(0, p, PASS_GC); _baseErr += getCostValue(); bprop(p, PASS_GC); } // We call grad check only on the first replica, // but because weights are aware of their fellow replicas, // we can simultaneously perturb the weights of all // replicas. 
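/*
 * A minimal sketch of the finite-difference test that checkGradient() below
 * applies to every weight (the function f here is hypothetical, standing in
 * for the network cost): perturb one parameter by eps, re-evaluate, and
 * compare the forward-difference slope against the analytic gradient, as in
 * (err - _baseErr) / eps versus the reduced gradient matrix.
 *
 *   #include <cmath>
 *   #include <cstdio>
 *   double f(double w) { return w * w; } // analytic gradient: 2w
 *   int main() {
 *       const double w = 3.0, eps = 1e-4;
 *       double numeric  = (f(w + eps) - f(w)) / eps; // forward difference
 *       double analytic = 2.0 * w;
 *       double relErr   = std::fabs(numeric - analytic) / std::fabs(analytic);
 *       printf("numeric=%f analytic=%f relErr=%e\n", numeric, analytic, relErr);
 *       return relErr < 1e-3 ? 0 : 1; // cf. GC_REL_ERR_THRESH below
 *   }
 */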
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) { map& layers = it->second; if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't) layers[0]->checkGradient(); } } cout << "------------------------" << endl; if (_numFailures > 0) { cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl; } else { cout << "ALL " << _numTests << " TESTS PASSED" << endl; } } // Copies to all replicas void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) { int d = NVMatrix::getDeviceID(); for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { NVMatrix::setDeviceID(it->second->getDeviceID()); it->second->getW().copyFromHost(weightsCPU); } NVMatrix::setDeviceID(d); } /* * name: weight matrix name * eps: finite difference step */ bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) { Matrix numGrad(weights.getNumRows(), weights.getNumCols()); Matrix diff(numGrad); numGrad.apply(Matrix::ZERO); Matrix weightsCPU; weights.getW().copyToHost(weightsCPU, true); for(int i = 0; i < weights.getNumRows(); i++) { for (int j = 0; j < weights.getNumCols(); j++) { float v = weightsCPU(i,j); weightsCPU(i,j) += eps; checkGradient_copyWeightsToGPU(weightsCPU, weights); weightsCPU(i,j) = v; double err = 0; for (int p = 0; p < getNumPasses(); ++p) { // printf("trying fprop %d\n", p); fprop(0, p, PASS_GC); // printf(" success\n"); err += getCostValue(); } numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps); if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) { cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl; cout << "Consider reducing the sizes of the weights or finite difference steps." << endl; cout << "Exiting." << endl; exit(1); } checkGradient_copyWeightsToGPU(weightsCPU, weights); } } Matrix gradCPU; NVMatrix::setDeviceID(weights.getDeviceID()); map mats; for (map::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) { mats[it->first] = &it->second->getGrad(); } weights.getReducer().reduce(mats, 1, false); weights.getGrad().copyToHost(gradCPU, true); gradCPU.scale(-1.0 / _data->getNumCases()); float analNorm = gradCPU.norm(); float numNorm = numGrad.norm(); numGrad.subtract(gradCPU, diff); float relErr = diff.norm() / analNorm; bool fail = relErr >= GC_REL_ERR_THRESH; if (fail || !GC_SUPPRESS_PASSES) { cout << "========================" << endl; printf("(%s) %s GRADIENT CHECK\n", fail ? 
"****FAIL****" : "PASS", name.c_str()); cout << "========================" << endl; cout << "Analytic:" << endl; gradCPU.print(0, 6, 0, 4); cout << "Numeric:" << endl; numGrad.print(0, 6, 0, 4); printf("Analytic norm: %e\n", analNorm); printf("Numeric norm: %e\n", numNorm); printf("Relative error: %e\n", relErr); } _numTests++; _numFailures += fail; return fail; } /* * ======================================================================================================= * ConvNetThread * ======================================================================================================= */ ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet) : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) { try { int numLayers = layerList->size(); for (int i = 0; i < numLayers; i++) { PyObject* paramsDict = layerList->at(i); std::string layerType = pyDictGetString(paramsDict, "type"); if (layerType != "data") { intv& gpus = *pyDictGetIntV(paramsDict, "gpu"); int rid = indexOf(gpus, deviceIdx); if (rid >= 0) { initLayer(paramsDict, rid); } delete &gpus; } } } catch (std::string& s) { cout << "Error creating ConvNet: " << s << endl; exit(1); } } ConvNetThread::~ConvNetThread() { NVMatrix::setDeviceID(_deviceID); NVMatrix::destroyCublas(); NVMatrix::destroyRandom(); for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { delete it->second; } _nameLayerMap.clear(); } void ConvNetThread::startTimer() { NVMatrix::syncStream(); _timer.start(); } double ConvNetThread::stopTimer() { NVMatrix::syncStream(); return _timer.stop(); } void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) { std::string type = pyDictGetString(paramsDict, "type"); std::string name = pyDictGetString(paramsDict, "name"); if (type == "fc") { _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false); } else if (type == "sfc") { _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false); } else if (type == "conv") { _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID); } else if (type == "local") { _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID); } else if (type == "pool") { _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID); } else if (type == "cmpool") { _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID); } else if (type == "rnorm") { _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID); } else if (type == "cmrnorm") { _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID); } else if (type == "cnorm") { _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID); } else if (type == "softmax") { _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID); } else if (type == "eltsum") { _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID); } else if (type == "eltmax") { _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID); } else if (type == "neuron") { _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID); } else if (type == "nailbed") { _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID); } else if (type == "blur") { _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID); } else if (type == "href") { _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID); } else if (type == "resize") { _nameLayerMap[name] = new ResizeLayer(this, 
paramsDict, replicaID); } else if (type == "rgb2yuv") { _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID); } else if (type == "rgb2lab") { _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID); } else if (type == "rscale") { _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID); } else if (type == "crop") { _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID); } else if (type == "concat") { _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID); } else if (type == "pass") { _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID); } else if (type == "dropout") { _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID); } else if (type == "dropout2") { _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID); } else if (strncmp(type.c_str(), "cost.", 5) == 0) { CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID); _nameLayerMap[name] = c; _costs.push_back(c); } else { throw std::string("Unknown layer type ") + type; } } /* * This executes in a new CPU thread so it's OK to initialize CUDA stuff here. */ void ConvNetThread::initCuda() { NVMatrix::setDeviceID(_deviceID); checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) { int d = _convNet->getDeviceIDs()[i]; if (d != _deviceID) { if (NVMatrix::canAccessPeer(_deviceID, d)) { printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d); checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0)); } else { printf("No peer access GPU %d --> GPU %d\n", _deviceID, d); } } } // NVMatrix::syncStream(); NVMatrix::initCublas(); NVMatrix::initRandom(/*7*/); srand(time(0)); } void* ConvNetThread::run() { initCuda(); bool exit = false; while (!exit) { Message* m = _msgQueue.dequeue(); if (m->getType() == FPROP_READY) { FpropMessage* msg = static_cast(m); msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx()); } else if (m->getType() == BPROP_READY) { BpropMessage* msg = static_cast(m); msg->getToLayer().incRcvdBInputMsgs(); msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx()); } else if (m->getType() == BPROP_START) { BpropStartMessage* msg = static_cast(m); for (int i = 0; i < _costs.size(); i++) { dynamic_cast(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx()); } } else if (m->getType() == SYNC) { NVMatrix::syncStream(); _convNet->getSync().sync(); } else if (m->getType() == COPY_TO_CPU) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->copyToCPU(); } } else if (m->getType() == COPY_TO_GPU) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->copyToGPU(); } } else if (m->getType() == RESET) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->reset(); } } else if (m->getType() == RESET_PASS_IDX) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->resetPassIdx(); } } else if (m->getType() == UPDATE_WEIGHTS) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->updateWeights(); } } else if (m->getType() == CONSTRAIN_WEIGHTS) { for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) { it->second->constrainWeights(); } } else if (m->getType() == EXIT_CONVNET) { exit = true; } delete m; } return NULL; } Cost& 
ConvNetThread::getCost() { // In a single ConvNetThread, all costs are guaranteed to be different // (i.e. not replicas of one another) return *new Cost(_costs); } Layer& ConvNetThread::getLayer(std::string& name) { return *_nameLayerMap[name]; } int ConvNetThread::getDeviceID() { return _deviceID; } Queue& ConvNetThread::getMessageQueue() { return _msgQueue; } vector& ConvNetThread::getCostLayers() { return _costs; } NameLayerMap& ConvNetThread::getLayerMap() { return _nameLayerMap; } ConvNet& ConvNetThread::getConvNet() { return *_convNet; } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/copypipeline.cuh" //#include "gpu_util.cuh" using namespace std; /* ========================= * ICopySegment * ========================= */ ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) { _execDeviceID = _deviceID; } ICopySegment::~ICopySegment() { if (_stream != NULL) { checkCudaErrors(cudaStreamDestroy(_stream)); } } void* ICopySegment::run() { assert(_execDeviceID != DEVICE_HOST); NVMatrix::setDeviceID(_execDeviceID); checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking)); bool exit = false; while (!exit) { CopyMessage& msg = *_queue.dequeue(); if (msg.getType() == CopyMessage::EXIT) { exit = true; } else { bool term = processMessage(msg); if (term) { assert(_finishQueue != NULL); _finishQueue->enqueue(1); } } delete &msg; } return NULL; } NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) { NVMatrix& line = mat.reshaped(1, mat.getNumElements()); int start = chunkIdx * chunkSize; int end = min((chunkIdx+1) * chunkSize, mat.getNumElements()); NVMatrix& chunk = line.sliceCols(start, end); delete &line; return chunk; } inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) { if (getDeviceID() == DEVICE_HOST) { return _hmat; } return msg.getMatrix(getDeviceID()); } Queue& ICopySegment::getQueue() { return _queue; } inline int ICopySegment::getDeviceID() { return _deviceID; } void ICopySegment::addPrev(ICopySegment& c) { _prev = &c; if (_deviceID == DEVICE_HOST) { _execDeviceID = c.getDeviceID(); } } void ICopySegment::addNext(CopyPeer& c) { _next.push_back(&c); c.addPrev(*this); } bool ICopySegment::isTerminal() const { return _next.size() == 0; } /* ========================= * CopySource * ========================= */ CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) { } bool CopySource::processMessage(CopyMessage& msg) { assert(msg.getType() == CopyMessage::COPY_START); int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, 
DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE)))); int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks); // printf("num chunks: %d\n", numChunks); for (int c = 0; c <= numChunks; ++c) { for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices())); } } return false; } inline bool CopySource::isSource() const { return true; } /* ========================= * CopyPeer * ========================= */ CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue* finishQueue) : ICopySegment(parent, deviceID, finishQueue) { } bool CopyPeer::processMessage(CopyMessage& msg) { assert(msg.getType() == CopyMessage::COPY_CHUNK); CopyChunkMessage& cmsg = *static_cast(&msg); if (cmsg.getChunkIdx() < cmsg.getNumChunks()) { if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) { getMatrix(msg).resize(_prev->getMatrix(msg)); } // getMatrix(msg).printShape("getMatrix(msg)"); // _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)"); assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg))); const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0; const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1; NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx()); prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream); NVMatrix::syncStream(_stream); delete &prevChunk; delete &myChunk; } for (vector::const_iterator it = _next.begin(); it != _next.end(); ++it) { (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg)); } return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal(); } inline bool CopyPeer::isSource() const { return false; } /* ========================= * IBroadcastNetwork * ========================= */ IBroadcastNetwork& IBroadcastNetwork::make(set devices, int srcDevice) { if (devices.size() == 8) { return (new EightGPUBroadcaster1(devices, srcDevice))->construct(); } else if (devices.size() == 1) { return (new NullBroadcaster(devices, srcDevice))->construct(); } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); } return (new NaiveBroadcaster(devices, srcDevice))->construct(); } IBroadcastNetwork::IBroadcastNetwork(set& devices, int srcDeviceID, int numTerminal) : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) { } IBroadcastNetwork::~IBroadcastNetwork() { vector v; v.insert(v.end(), _peers.begin(), _peers.end()); v.insert(v.end(), _src); for (vector::const_iterator it = v.begin(); it != v.end(); ++it) { (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT)); (*it)->join(); delete *it; } } IBroadcastNetwork& IBroadcastNetwork::construct() { assert(!_constructed); pair,vector > gpus = makeGPULists(); _src = new CopySource(*this, _srcDeviceID); makePeers(gpus); makeConnections(); _src->start(); for (vector::const_iterator it = _peers.begin(); it != _peers.end(); ++it) { (*it)->start(); } _constructed = true; return *this; } pair,vector > IBroadcastNetwork::makeGPULists() { vector same, other; for (set::const_iterator it = _devices.begin(); it != _devices.end(); ++it) { if (*it != _srcDeviceID) { if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) { same.insert(same.begin() + 
rand() % (1 + same.size()), *it); } else { other.insert(other.begin() + rand() % (1 + other.size()), *it); } } } return pair,vector >(same, other); } void IBroadcastNetwork::broadcast(std::map& mats) { _broadcast(mats, 1, 0); } void IBroadcastNetwork::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { assert(_constructed); assert(_finishQueue.getNumElements() == 0); assert(mats.size() == _devices.size()); assert(mats.size() > 1); if (mats[_srcDeviceID]->getNumElements() == 0) { for (map::const_iterator it = mats.begin(); it != mats.end(); ++it) { it->second->resize(*mats[_srcDeviceID]); } } else { _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats)); for (int i = 0; i < _numTerminal; ++i) { _finishQueue.dequeue(); } } assert(_finishQueue.getNumElements() == 0); } int IBroadcastNetwork::getSourceDeviceID() const { return _srcDeviceID; } void IBroadcastNetwork::makePeers(pair,vector >& gpus) { vector& same = gpus.first, &other = gpus.second; for (int i = 0; i < same.size(); ++i) { _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue)); } for (int i = 0; i < other.size(); ++i) { _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue)); } _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7] } /* ========================= * ISafeBroadcastNetwork * ========================= */ ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set devices, int srcDevice) { if (devices.size() == 1) { return (new NullBroadcaster(devices, srcDevice))->construct(); } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) { return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct(); } return (new NaiveBroadcaster(devices, srcDevice))->construct(); } ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) { } void ISafeBroadcastNetwork::broadcast(std::map& mats, float scaleSource, float scaleTargets) { _broadcast(mats, scaleSource, scaleTargets); } ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() { IBroadcastNetwork::construct(); return *this; } /* ========================= * NullBroadcaster * ========================= */ NullBroadcaster::NullBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { } void NullBroadcaster::makeConnections() { } NullBroadcaster& NullBroadcaster::construct() { _constructed = true; return *this; } void NullBroadcaster::broadcast(std::map& mats, float scaleSource, float scaleTargets) { } void NullBroadcaster::broadcast(std::map& mats) { } /* ========================= * NaiveBroadcaster * ========================= * * This one does src -> host -> all */ NaiveBroadcaster::NaiveBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size()-1) { } void NaiveBroadcaster::makeConnections() { _src->addNext(*_peers.back()); // Make connection src -> host for (int i = 0; i < _peers.size() - 1; ++i) { if (_peers[i]->getDeviceID() != _src->getDeviceID()) { _peers.back()->addNext(*_peers[i]); // Make connection host -> peer } } } /* ========================= * EightGPUBroadcaster1 * ========================= * * This one does a fancy graph */ EightGPUBroadcaster1::EightGPUBroadcaster1(set& devices, int srcDeviceID) : IBroadcastNetwork(devices, srcDeviceID, 4) { } void EightGPUBroadcaster1::makeConnections() { _src->addNext(*_peers[7]); _peers[7]->addNext(*_peers[0]); 
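/*
 * The addNext() calls here and immediately below wire the 8-GPU broadcast into
 * a two-level tree (sketch reconstructed from those calls; peers[7] is the
 * host-staging peer created last by makePeers()):
 *
 *   src -> peers[7] -> peers[0]
 *                   -> peers[1] -> peers[2]
 *                   -> peers[3] -> peers[5]
 *                   -> peers[4] -> peers[6]
 *
 * The four leaves (peers 0, 2, 5, 6) are the terminal segments, matching the
 * numTerminal = 4 that this class's constructor passes to IBroadcastNetwork.
 */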
_peers[7]->addNext(*_peers[1]); _peers[7]->addNext(*_peers[3]); _peers[7]->addNext(*_peers[4]); _peers[1]->addNext(*_peers[2]); _peers[3]->addNext(*_peers[5]); _peers[4]->addNext(*_peers[6]); } /* ========================= * TwoPeeringGPUsBroadcaster * ========================= */ TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) { _tgtDeviceID = *devices.begin() == srcDeviceID ? *(++devices.begin()) : *devices.begin(); } TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() { if (_constructed) { checkCudaErrors(cudaStreamDestroy(_tgtStream)); } } void TwoPeeringGPUsBroadcaster::makeConnections() { } void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) { if (d >= 0) { NVMatrix::setDeviceID(d); } } ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() { assert(!_constructed); int d = NVMatrix::getDeviceID(); NVMatrix::setDeviceID(_tgtDeviceID); checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking)); resetDeviceID(d); _constructed = true; return *this; } void TwoPeeringGPUsBroadcaster::_broadcast(std::map& mats, float scaleSource, float scaleTargets) { int d = NVMatrix::getDeviceID(); NVMatrix::setDeviceID(_tgtDeviceID); mats[_tgtDeviceID]->add(*mats[_srcDeviceID], scaleTargets, scaleSource, *mats[_tgtDeviceID], _tgtStream); NVMatrix::syncStream(_tgtStream); resetDeviceID(d); } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "../include/cost.cuh" using namespace std; /* * ===================== * Cost * ===================== */ Cost::Cost() { } Cost::Cost(vector& costs) { for (vector::iterator it = costs.begin(); it != costs.end(); ++it) { _costMap[(*it)->getName()] = &(*it)->getCost(); _costCoeffMap[(*it)->getName()] = (*it)->getCoeff(); _numCases[(*it)->getName()] = (*it)->getNumCases(); } } int Cost::getNumCases() { return _numCases.size() == 0 ? 0 : _numCases.begin()->second; } map& Cost::getNumCasesMap() { return _numCases; } doublev& Cost::operator [](const std::string s) { return *_costMap[s]; } CostMap& Cost::getCostMap() { return _costMap; } CostCoeffMap& Cost::getCostCoeffMap() { return _costCoeffMap; } double Cost::getValue() { double val = 0; for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) { val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 
0 : it->second->at(0)); } return val; } Cost& Cost::operator += (Cost& er) { CostMap& otherMap = er.getCostMap(); CostCoeffMap& otherCoeffMap = er.getCostCoeffMap(); for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) { bool newCost = _costMap.count(it->first) == 0; if (newCost) { _costMap[it->first] = new doublev(); _costCoeffMap[it->first] = otherCoeffMap[it->first]; _numCases[it->first] = er.getNumCasesMap()[it->first]; } else { _numCases[it->first] += er.getNumCasesMap()[it->first]; } doublev& myVec = *_costMap[it->first]; doublev& otherVec = *otherMap[it->first]; assert(myVec.size() == 0 || otherVec.size() == 0 || myVec.size() == otherVec.size()); // Add costs from otherVec to me for (int i = 0; i < otherVec.size(); i++) { if (myVec.size() <= i) { myVec.push_back(0); } myVec[i] += otherVec[i]; } } return *this; } Cost::~Cost() { for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { delete it->second; } } void Cost::print() { for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) { printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]); doublev& vec = *_costMap[it->first]; for (int z = 0; z < vec.size(); ++z) { printf("%.3f", vec[z]); if (z < vec.size() - 1) { printf(", "); } } printf("\n"); } } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "../../util/include/matrix.h" #include "../include/data.cuh" #include "../include/timer.cuh" using namespace std; DataProvider::DataProvider(int minibatchSize) : _minibatchSize(minibatchSize), _hData(NULL) { } void DataProvider::clearData() { delete _hData; _hData = NULL; } void DataProvider::setData(CPUData& hData) { // DataWorker calls clearData _hData = &hData; assert(_hData != NULL); } CPUData& DataProvider::getMinibatch(int idx) { assert(idx >= 0 && idx < getNumMinibatches()); return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize); } CPUData& DataProvider::getDataSlice(int startCase, int endCase) { assert(_hData != 0); assert(_hData->getNumCases() > 0); endCase = min(_hData->getNumCases(), endCase); // TODO: maintain these matrices, no point re-creating them all the time MatrixV& miniData = *new MatrixV(); for (int i = 0; i < _hData->getData().size(); i++) { // NOTE: if hData is transposed, then the output minibatch matrix // can be a view. No need to allocate new CPU memory here. Might // want to look into optimizing that in the future, though it's // unlikely to be a big deal. 
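/*
 * Sketch of the slicing arithmetic used by getDataSlice() and
 * getNumMinibatches() (standalone example with hypothetical sizes): DIVUP
 * rounds up, so a trailing partial minibatch still counts, and the end case
 * is clamped just as in getDataSlice() above.
 *
 *   #include <cstdio>
 *   #define DIVUP(x, y) (((x) + (y) - 1) / (y))
 *   int main() {
 *       const int numCases = 1000, minibatchSize = 128;
 *       const int numMinibatches = DIVUP(numCases, minibatchSize); // 8
 *       for (int i = 0; i < numMinibatches; ++i) {
 *           int start = i * minibatchSize;
 *           int end = (i + 1) * minibatchSize;
 *           if (end > numCases) end = numCases; // last batch holds 104 cases
 *           printf("minibatch %d: cases [%d, %d)\n", i, start, end);
 *       }
 *       return 0;
 *   }
 */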
if (_hData->isTrans()) { miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase)); } else { miniData.push_back(new Matrix()); (*_hData)[i].sliceCols(startCase, endCase, *miniData.back()); } } return *new CPUData(&miniData); } int DataProvider::getNumMinibatches() { assert(_hData != 0); assert(_hData->getNumCases() > 0); return DIVUP(_hData->getNumCases(), _minibatchSize); } int DataProvider::getMinibatchSize() { return _minibatchSize; } int DataProvider::getNumCases() { assert(_hData != 0); assert(_hData->getNumCases() > 0); return _hData->getNumCases(); } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/util.cuh" #include "../include/gradreducer.cuh" using namespace std; /* ===================== * IGradReducer * ===================== */ IActGradReducer::IActGradReducer(Layer& parent, map<int, int> numExpectedMsgs) : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) { _numExpectedMsgsTotal = 0; for (map<int, int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) { _numExpectedMsgsTotal += it->second; } // printf("%s[%d] expected %d backward msgs\n", parent.getName().c_str(), parent.getReplicaID(), _numExpectedMsgsTotal); } IActGradReducer::~IActGradReducer() { } void* IActGradReducer::run() { while (true) { reset(); if (reduce()) { break; } _finishQueue.enqueue(0); } return NULL; } // Cost layer will have nothing to dequeue, so just return immediately.
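/*
 * Descriptive note on the handshake used here (reconstructed from run() and
 * waitForFinish() in this file): the reducer thread loops reset()/reduce(),
 * pushing a token onto _finishQueue after each completed reduction, while the
 * owning layer's compute thread blocks in waitForFinish() until that token
 * arrives. When _numExpectedMsgsTotal == 0 there is never a token to wait
 * for, hence the immediate return mentioned above for cost layers.
 */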
int IActGradReducer::waitForFinish() { if (_numExpectedMsgsTotal > 0) { int i = _finishQueue.dequeue(); assert(_finishQueue.getNumElements() == 0); return i; } // printf("%s not waiting for finish\n", _name.c_str()); return 0; } IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map numExpectedMsgs) { int tgtDeviceID = parent.getDeviceID(); if (numExpectedMsgs.count(tgtDeviceID) == 0) { numExpectedMsgs[tgtDeviceID] = 0; } if (numExpectedMsgs.size() == 8) { return *new ParallelActGradReducer(parent, numExpectedMsgs); } return *new SequentialActGradReducer(parent, numExpectedMsgs); } /* ===================== * SequentialGradReducer * ===================== */ SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map numExpectedMsgs) : IActGradReducer(parent, numExpectedMsgs) { intv deviceIDs; int tgtDeviceID = parent.getDeviceID(); for (map::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) { if (it->first != tgtDeviceID) { deviceIDs.push_back(it->first); } } if (numExpectedMsgs[tgtDeviceID] > 0) { deviceIDs.push_back(tgtDeviceID); } sort(deviceIDs.begin(), deviceIDs.end()); int firstDeviceIdx = 0, firstDeviceID = 1 << 16; for (int i = 0; i < deviceIDs.size(); ++i) { if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) { firstDeviceIdx = i; firstDeviceID = deviceIDs[i]; } } // This is the order in which we process devices. for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) { int d = deviceIDs[i]; _deviceIDs.push_back(d); _messageQueues[d] = new Queue(); } //shuffleVector(_deviceIDs, 1, _deviceIDs.size()); _broadcaster = new StreamBroadcast(); // Note that we MUST process the tgtDeviceID first because // we write to it at every iteration, and the computation // thread writes to it too. By processing it first we ensure // that there's no race condition. 
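/*
 * Worked example of the circular ordering computed above (device IDs are
 * hypothetical): with tgtDeviceID = 2 and sorted producer devices {0, 1, 2, 3},
 * the walk starts at the smallest ID >= 2 and wraps around, so the target
 * device is drained first; the assert below verifies exactly this.
 *
 *   #include <cstdio>
 *   int main() {
 *       const int tgt = 2;
 *       const int ids[] = {0, 1, 2, 3}; // sorted, target included
 *       const int n = sizeof(ids) / sizeof(ids[0]);
 *       int first = 0;
 *       for (int i = 0; i < n; ++i) { if (ids[i] >= tgt) { first = i; break; } }
 *       for (int k = 0; k < n; ++k) { printf("%d ", ids[(first + k) % n]); }
 *       printf("\n"); // prints: 2 3 0 1
 *       return 0;
 *   }
 */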
assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID); reset(); } SequentialActGradReducer::~SequentialActGradReducer() { for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { delete it->second; } delete _broadcaster; } void SequentialActGradReducer::reset() { for (map::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) { _numReceivedMsgs[it->first] = 0; } } bool SequentialActGradReducer::reduce() { int tgtDeviceID = _parent->getDeviceID(); for (int didx = 0; didx < _deviceIDs.size(); ) { int d = _deviceIDs[didx]; _numReceivedMsgs[d] += _messageQueues[d]->dequeue(); if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) { if (d != tgtDeviceID) { NVMatrix::setDeviceID(tgtDeviceID); _parent->getActsGrad().resize(_parent->getActsGrad(d)); map mats; mats[d] = &_parent->getActsGrad(d); mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID); _broadcaster->transfer(mats, d, didx > 0, 1); } didx++; assert(_messageQueues[d]->getNumElements() == 0); } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit return true; } } return false; } void SequentialActGradReducer::enqueueReduction(int deviceID) { _messageQueues[deviceID]->enqueue(1); } void SequentialActGradReducer::stop() { for(map* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) { it->second->enqueue(ACT_GRAD_REDUCER_EXIT); } join(); } /* ===================== * ParallelActGradReducer * ===================== */ ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map numExpectedMsgs) : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) { _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct(); _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0; } bool ParallelActGradReducer::reduce() { // TODO: make it so that you can start the reduction before you've received all the messages. while(_numReceivedMsgs < _numExpectedMsgsTotal) { _numReceivedMsgs += _messageQueue.dequeue(); } if (_numReceivedMsgs > _numExpectedMsgsTotal) { return true; // exit } map mats = _parent->getAllActsGrads(); _reducer->reduce(mats, 1, _scaleTarget); assert(_messageQueue.getNumElements() == 0); return false; } void ParallelActGradReducer::enqueueReduction(int deviceID) { _messageQueue.enqueue(1); } void ParallelActGradReducer::stop() { _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT); join(); } void ParallelActGradReducer::reset() { _numReceivedMsgs = 0; } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "../include/jpeg.h" using namespace std; /* ======================== * DecoderThread * ======================== */ DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview) : Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img), _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview), _decodeTarget(0), _decodeTargetSize(0) { _inner_pixels = _inner_size * _inner_size; _rseed = time(0); } DecoderThread::~DecoderThread(){ free(_decodeTarget); } void* DecoderThread::run() { int numSrcCases = PyList_GET_SIZE(_pyList); assert(_target->getNumCols() == _inner_pixels * 3); assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1)); int width, height; for (int64 i = _start_img; i < _end_img; ++i) { decodeJpeg(i, width, height); assert((width == _img_size && height >= _img_size) || (height == _img_size && width >= _img_size)); if (_multiview) { for (int flip = 0; flip < 2; ++flip) { crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right } } else { crop(i, width, height, !_test && (rand_r(&_rseed) % 2)); } } return NULL; } void DecoderThread::decodeJpeg(int idx, int& width, int& height) { PyObject* pySrc = PyList_GET_ITEM(_pyList, idx); unsigned char* src = (unsigned char*)PyString_AsString(pySrc); size_t src_len = PyString_GET_SIZE(pySrc); struct jpeg_decompress_struct cinf; struct jpeg_error_mgr jerr; cinf.err = jpeg_std_error(&jerr); jpeg_create_decompress(&cinf); jpeg_mem_src(&cinf, src, src_len); assert(jpeg_read_header(&cinf, TRUE)); cinf.out_color_space = JCS_RGB; assert(jpeg_start_decompress(&cinf)); assert(cinf.num_components == 3 || cinf.num_components == 1); width = cinf.image_width; height = cinf.image_height; if (_decodeTargetSize < width * height * 3) { free(_decodeTarget); _decodeTargetSize = width * height * 3 * 3; _decodeTarget = (unsigned char*)malloc(_decodeTargetSize); } while (cinf.output_scanline < cinf.output_height) { JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline]; assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0); } assert(jpeg_finish_decompress(&cinf)); jpeg_destroy_decompress(&cinf); } /* * Uniform in [0,1) */ inline double DecoderThread::randUniform() { return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1); } /* * Uniform in [min, max) */ inline double DecoderThread::randUniform(double min, double max) { return (max - min) * randUniform() + min; } void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) { crop(i, src_width, src_height, flip, -1, -1); } void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) { const int64 border_size_y = src_height - _inner_size; const int64 border_size_x = src_width - _inner_size; if (crop_start_x < 0) { crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1)); } if (crop_start_y < 0) { crop_start_y = _test ? 
(border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1)); } const int64 src_pixels = src_width * src_height; for (int64 c = 0; c < 3; ++c) { for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) { for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) { assert((y >= 0 && y < src_height && x >= 0 && x < src_width)); _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size + (flip ? (_inner_size - 1 - x + crop_start_x) : (x - crop_start_x))) = _decodeTarget[3 * (y * src_width + x) + c]; } } } } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "../../cudaconv3/include/cudaconv2.cuh" #include "../../util/include/matrix.h" #include "../include/layer_kernels.cuh" #include "../include/layer.cuh" #include "../include/data.cuh" #include "../include/util.cuh" #include "../include/weights.cuh" using namespace std; /* * ======================= * Layer * ======================= */ Layer::Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) : _convNetThread(convNetThread), _replicaID(replicaID), _trans(trans) { _name = pyDictGetString(paramsDict, "name"); _type = pyDictGetString(paramsDict, "type"); _foundGradConsumers = false; _gradConsumer = pyDictGetInt(paramsDict, "gradConsumer"); _actsTarget = pyDictGetInt(paramsDict, "actsTarget"); _actsGradTarget = pyDictGetInt(paramsDict, "actsGradTarget"); _numOutputs = pyDictGetInt(paramsDict, "outputs"); _numReplicas = pyDictGetInt(paramsDict, "numReplicas"); _numReplicasPrev = 1; _rcvdBInputMsgs = 0; _actBroadcaster = NULL; _gradReducer = NULL; _initialized = false; } Layer::~Layer() { if (_actBroadcaster != NULL) { _actBroadcaster->stop(); delete _actBroadcaster; } if (_gradReducer != NULL) { _gradReducer->stop(); delete _gradReducer; } // For now, gradReducer doesn't have a destructor // delete _gradReducer; for (std::map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { if (it->second->getMemorySource().truncate(_name)) { delete &it->second->getMemorySource(); } } for (std::map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { if (it->second->getMemorySource().truncate(_name)) { delete &it->second->getMemorySource(); } } } cudaStream_t Layer::getStream() { assert(getDeviceID() >= 0); return NVMatrix::getDefaultStream(getDeviceID()); } void Layer::syncStream() { NVMatrix::syncStream(getStream()); } void Layer::fpropNext(PASS_TYPE passType, int passIdx) { if (_next.size() > 0) { if (getFwdActiveReplicaIdx(passIdx) == 0/*getReplicaIdx()*/) { // 0 turns on pipelining if (_nextDeviceIDs.size() > 1 || (_nextDeviceIDs.size() == 1 && _nextDeviceIDs[0] != getDeviceID())) { syncStream(); // Make sure I've finished computing before broadcasting } getActBroadcaster().getMessageQueue().enqueue(new 
BroadcastMessage(getAllActs(), getDeviceID(), getReplicaIdx(), _broadcastFinishQueue)); } if (getFwdActiveReplicaIdx(passIdx) == getReplicaIdx()) { _broadcastFinishQueue.dequeue(); assert(_broadcastFinishQueue.getNumElements() == 0); } } for (int i = 0; i < _next.size(); i++) { _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx)); } } bool Layer::fprop(PASS_TYPE passType, int passIdx) { _rcvdFInputMsgs++; // I require messages from *all* input replicas because it makes the propagation easier to think about. // Without this requirement, when all fprop terminal msgs arrive to ConvNet, the forward propagation // might not actually be finished yet. if (_rcvdFInputMsgs == getNumExpectedFwdMsgs()) { // printf("Layer %s[%d] fprop\n", _name.c_str(), getReplicaID()); int ridx = getFwdActiveInputReplicaIdx(passIdx); assert(getDeviceID() == NVMatrix::getDeviceID()); map v; if (ridx >= 0) { for (int i = 0; i < getNumLayersPrev(); i++) { v[i] = &_prev[ridx][i]->getActs(getDeviceID()); } } fprop(v, passType, passIdx); return true; } return false; } void Layer::fprop(map& v, PASS_TYPE passType, int passIdx) { if (getFwdActiveInputReplicaIdx(passIdx) >= 0) { assert(v.size() == getNumLayersPrev()); _inputs.clear(); _inputs.insert(v.begin(), v.end()); int numCases = _inputs[0]->getLeadingDim(); for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { it->second->getMemory(numCases); } if (numCases > 0) { //printf("layer %s fprop, numcases: %d\n", _name.c_str(), numCases); _rcvdFInputMsgs = getNumExpectedFwdMsgs(); for (map::iterator it = v.begin(); it != v.end(); ++it) { it->second->transpose(_trans); } getActs().transpose(_trans); fpropCommon(passType); // First do fprop on the input whose acts matrix I'm sharing, if any if (_actsTarget >= 0) { fpropActs(_actsTarget, 0, passType, passIdx); } // Then add the rest of the inputs to that for (int i = 0; i < getNumLayersPrev(); i++) { if (i != _actsTarget) { fpropActs(i, _actsTarget >= 0 || i > 0, passType, passIdx); } } } } fpropNext(passType, passIdx); } void Layer::truncBwdActs() { // Only truncate actsGrad if I own it if (_actsGradTarget < 0) { for (map::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { it->second->getMemorySource().truncate(getName()); } } if (_actsTarget < 0) { for (map::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { it->second->getMemorySource().truncate(getName()); } } } int Layer::getNumGradProducersNext() { return _numGradProducersNext; } int Layer::getNumExpectedBwdMsgs() { return _numGradProducersNext * getNumSiblingReplicas(); } int Layer::getNumExpectedFwdMsgs() { return getNumLayersPrev() * getNumInputReplicas(); } void Layer::bprop(PASS_TYPE passType, int passIdx) { if (getBwdActiveInputReplicaIdx(passIdx) >= 0 && _rcvdBInputMsgs == getNumExpectedBwdMsgs()) { // printf("Layer %s[%d] bprop\n", _name.c_str(), getReplicaID()); if (_gradReducer != NULL) { _gradReducer->waitForFinish(); } // This does sync, but only if it has grad consumers below! 
so we must sync again before sending bprop terminal messages bprop(getActsGrad(), passType, passIdx); if (_bwdTerminal[passIdx]) { syncStream(); getConvNet().getMessageQueue().enqueue(new Message(BPROP_TERMINAL)); } } } void Layer::bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx) { Layer& prev = *_prev[replicaIdx][inputIdx]; if (prev.isGradConsumer() && isGradProducer(prev.getName())) { if (v.getLeadingDim() > 0) { // Only do computation if #cases > 0 bpropActs(v, replicaIdx, inputIdx, prev.getNumComputedActsGrads(getDeviceID()) > 0, passType); } prev.getNumComputedActsGrads(getDeviceID())++; // Synchronize if the previous layer is going to actually do a reduction. // If the previous layer is on the same GPU as us and has no next layers // on other GPUs then it won't need to do a reduction. if (prev.getNextDeviceIDs().size() > 1 || (prev.getNextDeviceIDs().size() == 1 && getDeviceID() != prev.getDeviceID())) { syncStream(); } prev.getGradReducer().enqueueReduction(getDeviceID()); } } void Layer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) { v.transpose(_trans); assert(getDeviceID() == NVMatrix::getDeviceID()); int ridx = getBwdActiveInputReplicaIdx(passIdx); LayerV& prev = _prev[ridx]; map > prevByDevice = _prevByDevice[ridx]; for (int i = 0; i < prev.size(); i++) { _inputs[i]->transpose(_trans); prev[i]->getActsGrad().transpose(_trans); } getActs().transpose(_trans); // NOTE: this should be here (before the bpropActs) because if you have a layer // that has a weight matrix AND actsGradTarget >= 0, then the stuff below will overwrite // v which is used in bpropCommon. So bpropCommon must come first. bpropCommon(v, ridx, passType); if (isGradProducer()) { // First propagate activity gradient to all layers whose activity // gradient matrix I'm definitely not sharing. for (map >::const_iterator it = prevByDevice.begin(); it != prevByDevice.end(); ++it) { const set& deviceLayers = it->second; for (set::const_iterator it2 = deviceLayers.begin(); it2 != deviceLayers.end(); ++it2) { if (_actsGradTarget != (*it2)->getInputIdx(_name)) { bpropActsCall(v, passType, ridx, (*it2)->getInputIdx(_name)); } } } // Then propagate activity gradient to the layer whose activity gradient // matrix I'm sharing, if any. if (_actsGradTarget >= 0) { bpropActsCall(v, passType, ridx, _actsGradTarget); } } // Synchronization is necessary because the kernel calls that compute my backward acts // execute asynchronously. Therefore I don't want to tell other threads that I've // computed bprop activities for them when in fact I've only called a function which // will eventually compute them. if (_prevDeviceIDs.size() > 1 || (_prevDeviceIDs.size() == 1 && _prevDeviceIDs[0] != getDeviceID())) { syncStream(); } if (getConvNet().isConserveMemory()) { truncBwdActs(); } if (isGradProducer()) { /*for (int i = 0; i < prev.size(); i++) { if (prev[i]->isGradConsumer() && isGradProducer(prev[i]->getName())) { prev[i]->getGradReducer().enqueueReduction(getDeviceID()); } }*/ // Send backward messages to *all* replicas. // Note that the messages will be dismissed unless the passIdx indicates // that the previous layer should do some work. 
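/*
 * Sketch of the pass gating referred to in the note above (numbers are
 * hypothetical): with an active-pass period of 2 and 2 input replicas, a
 * receiving layer does backward work only on passes where
 * (passIdx + 1) % period == 0, and the active input replica cycles with
 * passIdx / period, mirroring getBwdActiveInputReplicaIdx() further below.
 *
 *   #include <cstdio>
 *   int main() {
 *       const int period = 2, numInputReplicas = 2;
 *       for (int passIdx = 0; passIdx < 4; ++passIdx) {
 *           int edge = (passIdx / period) % numInputReplicas;
 *           int ridx = (passIdx + 1) % period == 0 ? edge : -1; // -1: message dismissed
 *           printf("passIdx=%d -> active bprop input replica %d\n", passIdx, ridx);
 *       }
 *       return 0; // prints -1, 0, -1, 1
 *   }
 */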
for (int r = 0; r < getNumInputReplicas(); r++) { for (int i = 0; i < _prev[r].size(); i++) { if (_prev[r][i]->isGradConsumer() && isGradProducer(_prev[r][i]->getName())) { _prev[r][i]->getConvNetThread().getMessageQueue().enqueue(new BpropMessage(*_prev[r][i], passType, passIdx)); } } } } } IActGradReducer& Layer::getGradReducer() { return *_gradReducer; } // This is called between minibatches void Layer::reset() { _rcvdFInputMsgs = 0; _rcvdBInputMsgs = 0; for (map::iterator it = _numComputedActsGrads.begin(); it != _numComputedActsGrads.end(); ++it) { it->second = 0; } } // This is called between microbatches void Layer::resetPassIdx() { _rcvdFInputMsgs = 0; if (_rcvdBInputMsgs >= getNumExpectedBwdMsgs()) { reset(); } } /* * Returns number of cases in given matrix. */ int Layer::getNumCases(NVMatrix& v) { return v.getLeadingDim(); } int Layer::incRcvdBInputMsgs() { return ++_rcvdBInputMsgs; } std::string& Layer::getName() { return _name; } std::string& Layer::getType() { return _type; } int& Layer::getNumComputedActsGrads(int deviceID) { return _numComputedActsGrads[deviceID]; } void Layer::addNext(Layer& l) { _next.push_back(&l); _numReplicasNext = l.getNumReplicas(); if (count(_nextDeviceIDs.begin(), _nextDeviceIDs.end(), l.getDeviceID()) == 0) { int pos = rand() % (_nextDeviceIDs.size() + 1); _nextDeviceIDs.insert(_nextDeviceIDs.begin() + pos, l.getDeviceID()); } } void Layer::addPrev(Layer& l, int replicaIdx) { _prev[replicaIdx].push_back(&l); _numReplicasPrev = l.getNumReplicas(); l.setInputIdx(getName(), _prev[replicaIdx].size() - 1); if (l.getDeviceID() >= 0 && count(_prevDeviceIDs.begin(), _prevDeviceIDs.end(), l.getDeviceID()) == 0) { int pos = rand() % (_prevDeviceIDs.size() + 1); _prevDeviceIDs.insert(_prevDeviceIDs.begin() + pos, l.getDeviceID()); } } void Layer::addReplica(Layer& l) { assert(_replicas.count(l.getReplicaID()) == 0); _replicas[l.getReplicaID()] = &l; } bool Layer::hasGradProducerNext(std::string& layerName) { bool b = _next.size() == 0; for (int i = 0; i < _next.size(); i++) { b |= _next[i]->hasGradProducerNext(_name); } return b && isGradProducer(layerName); } bool Layer::postInit() { // We choose not to populate _outputs[getDeviceID()] here because we do it instead in fprop(). // In fprop(), we can populate it from the _inputs vector, which is a bit more general than populating // it from _prev->getActs() // _outputs = _actsTarget < 0 ? new NVMatrix() : &_prev[_actsTarget]->getActs(); if (!_initialized) { _initialized = true; map numGradProducersNext; _numGradProducersNext = 0; for (int r = 0; r < getNumInputReplicas(); ++r) { for (vector::const_iterator it = _prev[r].begin(); it != _prev[r].end(); ++it) { (*it)->postInit(); } } _memSrcActs[getDeviceID()] = _actsTarget < 0 ? &MemorySource::make(_numOutputs, getDeviceID(), getName()) : &_prev[0][_actsTarget]->getMemorySourceActs(getDeviceID()).clone(_name); // _actsGradTarget will only be >= 0 when the number of replicas is the same in both layers, so this justifies the use of _prev[0] _memSrcActsGrad[getDeviceID()] = _actsGradTarget < 0 ? 
&MemorySource::make(_numOutputs, getDeviceID(), getName()) : &_prev[0][_actsGradTarget]->getMemorySourceActsGrad(getDeviceID()).clone(_name); for (int i = 0; i < _next.size(); ++i) { int d = _next[i]->getDeviceID(); _numComputedActsGrads[d] = 0; if (_next[i]->hasGradProducerNext(_name)) { if (numGradProducersNext.count(d) == 0) { numGradProducersNext[d] = 0; } numGradProducersNext[d]++; _numGradProducersNext++; if (_memSrcActsGrad.count(d) == 0) { _memSrcActsGrad[d] = &MemorySource::make(_numOutputs, d, getName()); } } if (_memSrcActs.count(d) == 0) { _memSrcActs[d] = &MemorySource::make(_numOutputs, d, getName()); } } if (_next.size() == 0) { _numReplicasNext = getNumReplicas(); } /* * Initialize forward broadcaster. First sibling owns it. */ if (getReplicaIdx() == 0 && _convNetThread != NULL) { _actBroadcaster = new ActBroadcaster(getNumSiblingReplicas(), getDeviceCPUs(_convNetThread->getDeviceID())); _actBroadcaster->start(); } /* * Initialize backward reducer. */ if (isGradConsumer() && _numGradProducersNext > 0) { _gradReducer = &IActGradReducer::makeGradReducer(*this, numGradProducersNext); _gradReducer->start(); } /* * Initialize specially sorted previous array */ for (int r = 0; r < _prev.size(); ++r) { for (int i = 0; i < _prev[r].size(); ++i) { // Previous devices in reverse order of processing by (sequential) GradReducer _prevByDevice[r][getDeviceID() - _prev[r][i]->getDeviceID() + 16 * (_prev[r][i]->getDeviceID() > getDeviceID())].insert(_prev[r][i]); } } return true; } return false; } ActBroadcaster& Layer::getActBroadcaster() { return getReplicaIdx() == 0 ? *_actBroadcaster : _replicas[getReplicaID() - getReplicaIdx()]->getActBroadcaster(); } // Does this layer, or some layer below it, need the gradient // for parameter updates? // Only weight layers should be grad consumers themselves. bool Layer::isGradConsumer() { if (!_foundGradConsumers && _prev.size() > 0) { for (int i = 0; i < _prev[0].size(); i++) { _gradConsumer |= _prev[0][i]->isGradConsumer(); } _foundGradConsumers = true; } return _gradConsumer; } // Does this layer produce gradient for layers below? bool Layer::isGradProducer() { return true; } bool Layer::isGradProducer(std::string& layerName) { return isGradProducer(); } map >& Layer::getPrev() { return _prev; } vector& Layer::getNext() { return _next; } NVMatrix& Layer::getActs() { return getActs(getDeviceID()); } NVMatrix& Layer::getActs(int deviceID) { assert(_memSrcActs.count(deviceID) > 0); return _memSrcActs[deviceID]->getMemory(); } NVMatrix& Layer::getActs(int deviceID, int numCases) { assert(_memSrcActs.count(deviceID) > 0); return _memSrcActs[deviceID]->getMemory(numCases); } NVMatrix& Layer::getActsGrad(int deviceID) { assert(_memSrcActsGrad.count(deviceID) > 0); return _memSrcActsGrad[deviceID]->getMemory(getActs(deviceID).getLeadingDim()); } NVMatrix& Layer::getActsGrad() { return getActsGrad(NVMatrix::getDeviceID()); } map Layer::getAllActs() { map m; for (map::const_iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) { m[it->first] = &it->second->getMemory(); } return m; } map Layer::getAllActsGrads() { map m; for (map::const_iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) { m[it->first] = &it->second->getMemory(); } return m; } int Layer::getDeviceID() { return _convNetThread == NULL ? 
        -1 : _convNetThread->getDeviceID();
}

ConvNetThread& Layer::getConvNetThread() {
    assert(_convNetThread != NULL);
    return *_convNetThread;
}

ConvNet& Layer::getConvNet() {
    return getConvNetThread().getConvNet();
}

void Layer::setBwdTerminal(int passIdx) {
    _bwdTerminal[passIdx] = true;
}

int Layer::getReplicaID() {
    return _replicaID;
}

int Layer::getActivePassPeriod() {
    return getNumReplicas() / getConvNet().getNumReplicasMin();
}

int Layer::getFwdActiveInputReplicaIdx(int passIdx) {
    const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas();
    return passIdx % getActivePassPeriod() == 0 ? edge : -1;
}

int Layer::getBwdActiveInputReplicaIdx(int passIdx) {
    const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas();
    return (passIdx + 1) % getActivePassPeriod() == 0 ? edge : -1;
}

int Layer::getFwdActiveReplicaIdx(int passIdx) {
    assert(_next.size() > 0);
    return _next[0]->getFwdActiveInputReplicaIdx(passIdx);
}

int Layer::getNumReplicas() {
    return _replicas.size();
}

int Layer::getNumSiblingReplicas() {
    return getNumReplicas() / getNumReplicasNext();
}

int Layer::getNumReplicasPrev() {
    return _numReplicasPrev;
}

int Layer::getNumReplicasNext() {
    return _numReplicasNext;
}

int Layer::getNumInputReplicas() {
    return _numReplicasPrev / getNumReplicas();
}

int Layer::getReplicaIdx() {
    return getReplicaID() % getNumSiblingReplicas();
}

int Layer::getNumLayersPrev() {
    return _prev.size() > 0 ? _prev[0].size() : 0;
}

void Layer::setMemorySourceActs(int deviceID, MemoryView& mem) {
    assert(_memSrcActs[deviceID]->isParent());
    delete _memSrcActs[deviceID];
    _memSrcActs[deviceID] = &mem;
    if (_actsTarget >= 0 && deviceID == getDeviceID()) {
        assert(getNumInputReplicas() == 1);
        _prev[0][_actsTarget]->setMemorySourceActs(deviceID, mem.clone(_prev[0][_actsTarget]->getName()));
    }
}

void Layer::setMemorySourceActsGrad(int deviceID, MemoryView& mem) {
    assert(_memSrcActsGrad[deviceID]->isParent());
    delete _memSrcActsGrad[deviceID];
    _memSrcActsGrad[deviceID] = &mem;
    if (_actsGradTarget >= 0 && deviceID == getDeviceID()) {
        assert(getNumInputReplicas() == 1);
        _prev[0][_actsGradTarget]->setMemorySourceActsGrad(deviceID, mem.clone(_prev[0][_actsGradTarget]->getName()));
    }
}

MemoryView& Layer::getMemorySourceActs(int deviceID) {
    return *_memSrcActs[deviceID];
}

MemoryView& Layer::getMemorySourceActsGrad(int deviceID) {
    return *_memSrcActsGrad[deviceID];
}

int Layer::getNumOutputs() {
    return _numOutputs;
}

void Layer::setInputIdx(std::string& parentName, int idx) {
    _inputIndices[parentName] = idx;
}

int Layer::getInputIdx(std::string& parentName) {
    return _inputIndices[parentName];
}

/*
 * =======================
 * NeuronLayer
 * =======================
 */
NeuronLayer::NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, true) {
    PyObject* neuronDict = PyDict_GetItemString(paramsDict, "neuron");
    _neuronType = pyDictGetString(neuronDict, "type");
    _neuron = &Neuron::makeNeuron(neuronDict);
}

NeuronLayer::~NeuronLayer() {
    delete _neuron;
}

void NeuronLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(inpIdx == 0);
    if (!bpropSpecial(v, replicaIdx, inpIdx, scaleTargets, passType)) {
        _neuron->computeInputGrad(v, _prev[replicaIdx][0]->getActsGrad(), scaleTargets > 0);
    }
}

bool NeuronLayer::bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    // Special optimization for cross-entropy objective with logistic units.
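    // (For y = logistic(x) with cross-entropy cost J, dJ/dy blows up as y approaches
    // 0 or 1, but the product dJ/dy * dy/dx collapses to roughly coeff * (y - label),
    // which is what the fused path below computes.)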
    // Better to just compute the input gradient in one go to avoid division by small numbers.
    bool doCrossEntGrad = _neuronType == "logistic" && _next.size() == 1
                        && (_next[0]->getType() == "cost.bce" || _next[0]->getType() == "cost.dce")
                        && _next[0]->getDeviceID() == getDeviceID() && _next[0]->getNumReplicas() == getNumReplicas();
    LayerV& prev = _prev[replicaIdx];
    if (doCrossEntGrad) {
        NVMatrix& labels = _next[0]->getPrev()[replicaIdx][0]->getActs(getDeviceID());
        BinomialCrossEntropyCostLayer& cost = *static_cast<BinomialCrossEntropyCostLayer*>(_next[0]);
        float gradCoeff = cost.getCoeff();
        labels.transpose(_trans);
        if (cost.getPosWeight() == 1) {
            if (scaleTargets == 0) {
                getActs().add(labels, -gradCoeff, gradCoeff, prev[0]->getActsGrad());
            } else {
                getActs().applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::WeightedAdd>(NVMatrixBinaryOps::WeightedAdd(-gradCoeff, gradCoeff)),
                                       labels, prev[0]->getActsGrad(), prev[0]->getActsGrad());
            }
        } else {
            if (scaleTargets == 0) {
                getActs().applyBinary(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight()), labels, prev[0]->getActsGrad());
            } else {
                getActs().applyTernary(AddGradientBinaryOperator<CrossEntLogisticGradientOperator>(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight())),
                                       labels, prev[0]->getActsGrad(), prev[0]->getActsGrad());
            }
        }
    }
    return doCrossEntGrad;
}

void NeuronLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    _neuron->activate(*_inputs[0], getActs());
}

std::string& NeuronLayer::getNeuronType() {
    return _neuronType;
}

/*
 * =======================
 * WeightLayer
 * =======================
 *
 * The useGrad parameter here merely expresses a preference by the subclass. It may
 * be overridden by the superclass (WeightLayer) and in that case the subclass must follow its wishes.
 * So when computing gradient updates, the subclass must always first check weights.isUseGrad().
 *
 * Note: biases always useGrad.
 */
WeightLayer::WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad)
: Layer(convNetThread, paramsDict, replicaID, trans) {
    _weightUpdatePassPeriod = pyDictGetInt(paramsDict, "updatePeriod");

    MatrixV& hWeights = *pyDictGetMatrixV(paramsDict, "weights");
    MatrixV& hWeightsInc = *pyDictGetMatrixV(paramsDict, "weightsInc");
    Matrix& hBiases = *pyDictGetMatrix(paramsDict, "biases");
    Matrix& hBiasesInc = *pyDictGetMatrix(paramsDict, "biasesInc");
    PyObject* pyEpsWList = PyDict_GetItemString(paramsDict, "epsW");
    PyObject* pyEpsB = PyDict_GetItemString(paramsDict, "epsB");
    floatv& momW = *pyDictGetFloatV(paramsDict, "momW");
    float momB = pyDictGetFloat(paramsDict, "momB");
    floatv& wc = *pyDictGetFloatV(paramsDict, "wc");
    floatv& wball = *pyDictGetFloatV(paramsDict, "wballNormed");

    /*
     * When there are multiple replicas, the present implementation
     * requires that useGrad is true. This is because weights.update()
     * performs a simultaneous write to both replicas' weightsInc matrix,
     * which means that the read should come from somewhere else (i.e. a
     * grads matrix).
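     * (E.g. with two replicas, both accumulate into the shared weightsInc at
     * once; reading increments back out of it would race, so each replica
     * reads from its own grads matrix instead.)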
     */
    useGrad |= _numReplicas > 1;

    // Source layers for shared weights
    stringv& weightSourceLayers = *pyDictGetStringV(paramsDict, "weightSourceLayers");
    // Weight matrix indices (inside the above source layers) for shared weights
    intv& weightSourceMatrixIndices = *pyDictGetIntV(paramsDict, "weightSourceMatrixIndices");
    _weights = new WeightList();
    for (int i = 0; i < weightSourceLayers.size(); i++) {
        std::string& srcLayerName = weightSourceLayers[i];
        int matrixIdx = weightSourceMatrixIndices[i];
        PyObject* pyEpsW = PyList_GetItem(pyEpsWList, i);
        ParameterSchedule& lrs = ParameterSchedule::make(pyEpsW); // Learning rate schedule
        if (srcLayerName == _name) { // Current layer
            _weights->addWeights(*new Weights(_weights->at(matrixIdx), lrs, *this));
        } else if (srcLayerName != "") {
            WeightLayer& srcLayer = *static_cast<WeightLayer*>(&convNetThread->getLayer(srcLayerName));
            Weights* srcWeights = &srcLayer.getWeights(matrixIdx);
            _weights->addWeights(*new Weights(*srcWeights, lrs, *this));
        } else {
            _weights->addWeights(*new Weights(*hWeights[i], *hWeightsInc[i], lrs, *this, wc[i], wball[i], momW[i], useGrad));
        }
    }
    _biases = new Weights(hBiases, hBiasesInc, ParameterSchedule::make(pyEpsB), *this, 0, 0, momB, true);

    delete &weightSourceLayers;
    delete &weightSourceMatrixIndices;
    delete &hWeights;
    delete &hWeightsInc;
    delete &momW;
    delete &wc;
    delete &wball;

    _wStep = 0.02;
    _bStep = 0.005;
}

WeightLayer::~WeightLayer() {
    delete _weights;
    delete _biases;
}

bool WeightLayer::postInit() {
    if (Layer::postInit()) {
        _weightUpdatePassPeriod = max(_weightUpdatePassPeriod, getActivePassPeriod());
        assert(_weightUpdatePassPeriod % getActivePassPeriod() == 0);
        return true;
    }
    return false;
}

void WeightLayer::fpropCommon(PASS_TYPE passType) {
}

void WeightLayer::bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
    if (_biases->getLearningRateSchedule().getBaseValue() > 0) {
        if (v.getNumElements() > 0) {
            bpropBiases(v, passType);
        } else {
            _biases->getGrad().resize(_biases->getW());
            _biases->getGrad().scale(getBIncScale());
        }
        _biases->incNumUpdates();
    }
    for (int i = 0; i < _weights->getSize(); i++) {
        if (_weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
            if (v.getNumElements() > 0) {
                bpropWeights(v, replicaIdx, i, passType);
            } else {
                _weights->at(i).getGrad().resize(_weights->at(i).getW());
                // This will cause it to forget momentum when shown 0 training cases
                // and _useGrad = false but it's not too important.
                _weights->at(i).getGrad().scale(getIncScale(i, passType));
            }
            // Increment its number of updates
            _weights->at(i).incNumUpdates();
        }
    }
}

bool WeightLayer::updateWeights() {
    if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) {
        _weights->update(getConvNet().getTrainingProgress());
        _biases->update(getConvNet().getTrainingProgress());
        // constrainWeights();
        return true;
    }
    return false;
}

bool WeightLayer::constrainWeights() {
    if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) {
        _constrainWeights();
        return true;
    }
    return false;
}

void WeightLayer::_constrainWeights() {
}

void WeightLayer::copyToCPU() {
    _weights->copyToCPU();
    _biases->copyToCPU();
}

void WeightLayer::copyToGPU() {
    _weights->copyToGPU();
    _biases->copyToGPU();
}

void WeightLayer::checkGradient() {
    for (int i = 0; i < _weights->getSize(); i++) {
        getConvNet().checkGradient(_name + " weights[" + tostr(i) + "]", _wStep, _weights->at(i));
    }
    getConvNet().checkGradient(_name + " biases", _bStep, *_biases);
}

void WeightLayer::addReplica(Layer& l) {
    Layer::addReplica(l);
    _weights->addReplica(*static_cast<WeightLayer*>(&l)->_weights);
    _biases->addReplica(*static_cast<WeightLayer*>(&l)->_biases);
}

Weights& WeightLayer::getWeights(int idx) {
    return _weights->at(idx);
}

float WeightLayer::getGradScale(int inpIdx, PASS_TYPE passType) {
    // weight update period must be multiple of activation period
    // TODO: simply accumulate # of cases seen between weight updates. simpler and more accurate.
    double numCases = _weightUpdatePassPeriod * (getConvNet().getMinibatchSize() / double(getConvNet().getNumPasses()));
    if (_weights->at(inpIdx).isUseGrad()) {
        return passType == PASS_GC ? 1.0f : 1.0f / numCases;
    }
    return passType == PASS_GC ? 1.0f : _weights->at(inpIdx).getEps(getConvNet().getTrainingProgress()) / numCases;
}

float WeightLayer::getIncScale(int inpIdx, PASS_TYPE passType) {
    if (_weights->at(inpIdx).isUseGrad()) {
        return _weights->at(inpIdx).getNumUpdates() > 0;
    }
    return (passType == PASS_GC ? _weights->at(inpIdx).getNumUpdates() > 0
                                : (_weights->at(inpIdx).getNumUpdates() == 0 ? _weights->at(inpIdx).getMom() : 1.0f));
}

NVMatrix& WeightLayer::getGradTarget(int inpIdx) {
    return _weights->at(inpIdx).getGrad();
}

float WeightLayer::getBGradScale(PASS_TYPE passType) {
    int numCases = _weightUpdatePassPeriod * DIVUP(getConvNet().getMinibatchSize(), getConvNet().getNumPasses());
    return passType == PASS_GC ?
        1.0f : 1.0f / numCases;
}

float WeightLayer::getBIncScale() {
    return _biases->getNumUpdates() > 0;
}

NVMatrix& WeightLayer::getWeightMatrix(PASS_TYPE passType, int inpIdx) {
    return _weights->at(inpIdx).getW();
}

NVMatrix& WeightLayer::getBiasMatrix(PASS_TYPE passType) {
    return _biases->getW();
}

/*
 * =======================
 * FCLayer
 * =======================
 */
FCLayer::FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
: WeightLayer(convNetThread, paramsDict, replicaID, true, useGrad) {
    _wStep = 0.01;
    _bStep = 0.01;
}

void FCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    getActs().addProduct(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), scaleTargets, 1);
    if (scaleTargets == 0) {
        getActs().addVector(getBiasMatrix(passType), 1, getActs());
    }
}

void FCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose();
    _prev[replicaIdx][inpIdx]->getActsGrad().addProduct(v, weights_T, scaleTargets, 1);
    delete &weights_T;
}

void FCLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) {
    _biases->getGrad().addSum(v, 0, getBIncScale(), getBGradScale(passType));
}

void FCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
    NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose();
    float scaleGrad = getGradScale(inpIdx, passType);
    float scaleInc = getIncScale(inpIdx, passType);
    getGradTarget(inpIdx).addProduct(prevActs_T, v, scaleInc, scaleGrad);
    delete &prevActs_T;
}

void FCLayer::_constrainWeights() {
    for (int i = 0; i < _weights->getSize(); i++) {
        if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
            // NVMatrix norm2; // Unfortunate extra weight matrix...
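            // (Max-norm constraint, roughly: _norm2 receives each column's squared
            // L2 norm; the operator below presumably maps norms exceeding the wball
            // bound to a shrinking factor and the rest to 1, so the multiply
            // projects oversized columns back onto the ball.)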
            _weights->at(i).getW().sumOfSquares(0, _norm2);
            // norm2.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall()));
            _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall()));
            _weights->at(i).getW().eltwiseMultByVector(_norm2);
        }
    }
}

/*
 * =======================
 * SplitFCLayer
 * =======================
 */
SplitFCLayer::SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
: FCLayer(convNetThread, paramsDict, replicaID, useGrad) {
    _numParts = pyDictGetInt(paramsDict, "parts");
}

void SplitFCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    getActs().resize(_inputs[inpIdx]->getNumRows(), _numOutputs, true);
    NVMatrixV& splitInput = _inputs[inpIdx]->splitCols(_numParts);
    NVMatrixV& splitWeights = getWeightMatrix(passType, inpIdx).splitRows(_numParts);
    NVMatrixV& splitTarget = getActs().splitCols(_numParts);

    NVMatrix::batchedMatrixMultiply(splitInput, splitWeights, splitTarget, scaleTargets, 1);
    if (scaleTargets == 0) {
        getActs().addVector(getBiasMatrix(passType), 1, getActs());
    }

    deleteElements(splitInput, true);
    deleteElements(splitWeights, true);
    deleteElements(splitTarget, true);
}

void SplitFCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose();
    _prev[replicaIdx][inpIdx]->getActsGrad().resize(*_inputs[inpIdx]);
    NVMatrixV& splitV = v.splitCols(_numParts);
    NVMatrixV& splitWeights_T = weights_T.splitCols(_numParts);
    NVMatrixV& splitTarget = _prev[replicaIdx][inpIdx]->getActsGrad().splitCols(_numParts);

    NVMatrix::batchedMatrixMultiply(splitV, splitWeights_T, splitTarget, scaleTargets, 1);

    delete &weights_T;
    deleteElements(splitV, true);
    deleteElements(splitWeights_T, true);
    deleteElements(splitTarget, true);
}

void SplitFCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
    NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose();
    NVMatrixV& splitPrevActs_T = prevActs_T.splitRows(_numParts);
    NVMatrixV& splitV = v.splitCols(_numParts);
    NVMatrixV& splitGradTarget = getGradTarget(inpIdx).splitRows(_numParts);

    NVMatrix::batchedMatrixMultiply(splitPrevActs_T, splitV, splitGradTarget, getIncScale(inpIdx, passType), getGradScale(inpIdx, passType));

    delete &prevActs_T;
    deleteElements(splitPrevActs_T, true);
    deleteElements(splitV, true);
    deleteElements(splitGradTarget, true);
}

/*
 * =======================
 * TwoDLayerInterface
 * =======================
 */
TwoDLayerInterface::TwoDLayerInterface(PyObject* paramsDict) {
    _channels = pyDictGetInt(paramsDict, "channels");
    _imgSize = pyDictGetInt(paramsDict, "imgSize");
    _imgPixels = _imgSize * _imgSize;
}

/*
 * =======================
 * LocalLayer
 * =======================
 */
LocalLayer::LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
: WeightLayer(convNetThread, paramsDict, replicaID, false, useGrad) {
    _padding = pyDictGetIntV(paramsDict, "padding");
    _stride = pyDictGetIntV(paramsDict, "stride");
    _filterSize = pyDictGetIntV(paramsDict, "filterSize");
    _channels = pyDictGetIntV(paramsDict, "channels");
    _imgSize = pyDictGetIntV(paramsDict, "imgSize");
    _numFilters = pyDictGetInt(paramsDict, "filters");
    _groups = pyDictGetIntV(paramsDict, "groups");
    _filterChannels = pyDictGetIntV(paramsDict, "filterChannels");
    _filterPixels = pyDictGetIntV(paramsDict, "filterPixels");
    _imgPixels = pyDictGetIntV(paramsDict, "imgPixels");
    _modulesX = pyDictGetInt(paramsDict, "modulesX");
    _modules =
pyDictGetInt(paramsDict, "modules"); } LocalLayer::~LocalLayer() { delete _padding; delete _stride; delete _filterSize; delete _channels; delete _imgSize; delete _groups; delete _filterChannels; delete _filterPixels; delete _imgPixels; } /* * ======================= * ConvLayer * ======================= */ ConvLayer::ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : LocalLayer(convNetThread, paramsDict, replicaID, true) { _sumWidth = pyDictGetInt(paramsDict, "sumWidth"); _sharedBiases = pyDictGetInt(paramsDict, "sharedBiases"); _weightContrastNormMin = pyDictGetFloatV(paramsDict, "wcNormMin"); _weightContrastNormMax = pyDictGetFloatV(paramsDict, "wcNormMax"); } ConvLayer::~ConvLayer() { delete _weightContrastNormMin; delete _weightContrastNormMax; } void ConvLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { convFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1); if (scaleTargets == 0) { if (_sharedBiases) { getActs().reshape(_numFilters, getActs().getNumElements() / _numFilters); getActs().addVector(getBiasMatrix(passType)); getActs().reshape(_numFilters * _modules, getActs().getNumElements() / (_numFilters * _modules)); } else { getActs().addVector(getBiasMatrix(passType)); } } } void ConvLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) { float scaleBGrad = getBGradScale(passType); float scaleInc = getBIncScale(); if (_sharedBiases) { v.reshape(_numFilters, v.getNumElements() / _numFilters); _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); v.reshape(_numFilters * _modules, v.getNumElements() / (_numFilters * _modules)); } else { _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad); } } void ConvLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) { assert(_weights->at(inpIdx).isUseGrad()); bool doPartialSum = _sumWidth < _modulesX; NVMatrix& tgt = doPartialSum ? 
        _weightGradTmp : _weights->at(inpIdx).getGrad();

    float scaleWGrad = getGradScale(inpIdx, passType);
    float scaleTargets = getIncScale(inpIdx, passType) * !doPartialSum;

    convWeightActs(*_inputs[inpIdx], v, tgt, _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx),
                   _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), _sumWidth, scaleTargets, scaleWGrad);

    if (doPartialSum) {
        scaleTargets = _weights->at(inpIdx).getNumUpdates() > 0;
        int outWidth = DIVUP(_modulesX, _sumWidth);
        _weightGradTmp.reshape(outWidth*outWidth, _filterChannels->at(inpIdx) * _filterPixels->at(inpIdx) * _numFilters);
        _weights->at(inpIdx).getGrad().addSum(_weightGradTmp, 0, scaleTargets, 1);
        _weights->at(inpIdx).getGrad().reshape(_filterChannels->at(inpIdx) * _filterPixels->at(inpIdx), _numFilters);
    }
}

void ConvLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(), _imgSize->at(inpIdx), _imgSize->at(inpIdx),
                _modulesX, _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
}

void ConvLayer::truncBwdActs() {
    LocalLayer::truncBwdActs();
    _weightGradTmp.truncate();
}

void ConvLayer::_constrainWeights() {
    for (int i = 0; i < _weights->getSize(); i++) {
        if (_weightContrastNormMax->at(i) > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
            float fz = _weights->at(i).getW().getNumRows();
            NVMatrix tmp;
            _weights->at(i).getW().sum(0, tmp);
            _weights->at(i).getW().addVector(tmp, -1.0f / fz, _weights->at(i).getGrad());
            // Now _weights->at(i).getGrad() contains zero-mean filters
            _weights->at(i).getGrad().apply(NVMatrixOps::Square());
            _weights->at(i).getGrad().sum(0, tmp);
            tmp.apply(WeightContrastNormOperator(_weightContrastNormMin->at(i), _weightContrastNormMax->at(i), 1.0f / fz));
            // Now tmp has the stdev
            _weights->at(i).getW().eltwiseMultByVector(tmp);
        }
        // It's pretty silly to do both these things but whatever
        if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
            // NVMatrix norm2;
            _weights->at(i).getW().sumOfSquares(0, _norm2);
            // norm.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall()));
            _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall()));
            _weights->at(i).getW().eltwiseMultByVector(_norm2);
        }
    }
}

/*
 * =======================
 * LocalUnsharedLayer
 * =======================
 */
LocalUnsharedLayer::LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: LocalLayer(convNetThread, paramsDict, replicaID, false) {
}

void LocalUnsharedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    localFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX,
                    _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
    if (scaleTargets == 0) {
        getActs().addVector(getBiasMatrix(passType));
    }
}

void LocalUnsharedLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) {
    _biases->getGrad().addSum(v, 1, getBIncScale(), getBGradScale(passType));
}

void LocalUnsharedLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
    float scaleWGrad = getGradScale(inpIdx, passType);
    float scaleInc = getIncScale(inpIdx, passType);
    localWeightActs(*_inputs[inpIdx], v, getGradTarget(inpIdx),
                    _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx),
                    _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleInc, scaleWGrad);
}

void LocalUnsharedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    localImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(), _imgSize->at(inpIdx), _imgSize->at(inpIdx),
                 _modulesX, _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
}

void LocalUnsharedLayer::_constrainWeights() {
    for (int i = 0; i < _weights->getSize(); i++) {
        if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
            normalizeLocalWeights(*_weights->at(i), _modules, _weights->at(i).getWBall());
        }
    }
}

/*
 * =======================
 * SoftmaxLayer
 * =======================
 */
SoftmaxLayer::SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, true), _doUpperGrad(false) {
}

void SoftmaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    NVMatrix& input = *_inputs[0];
    input.max(1, _max);
    input.addVector(_max, -1, getActs());
    getActs().apply(NVMatrixOps::Exp());
    getActs().sum(1, _sum);
    getActs().eltwiseDivideByVector(_sum);
}

void SoftmaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(inpIdx == 0);
    LayerV& prev = _prev[replicaIdx];
    if (_doUpperGrad) {
        // Todo: rethink replica IDs or idxes... this here doesn't make a huge amount of sense
        for (int i = 0; i < _next.size(); ++i) {
            if (_next[i]->isGradProducer(getName())) {
                NVMatrix& labels = _next[i]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); // Get cost's labels
                float gradCoeff = dynamic_cast<CostLayer*>(_next[i])->getCoeff();
                computeLogregSoftmaxGrad(labels, getActs(), prev[0]->getActsGrad(), scaleTargets == 1, gradCoeff);
                break;
            }
        }
    } else {
        computeSoftmaxGrad(getActs(), v, prev[0]->getActsGrad(), scaleTargets, 1);
    }
}

void SoftmaxLayer::setDoUpperGrad(bool b) {
    _doUpperGrad = b;
}

/*
 * =======================
 * ConcatenationLayer
 * =======================
 */
ConcatenationLayer::ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
    _copyOffsets = pyDictGetIntV(paramsDict, "copyOffsets");
    _copyOffsets->push_back(_numOutputs);
}

ConcatenationLayer::~ConcatenationLayer() {
    delete _copyOffsets;
}

void ConcatenationLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    getActs().resize(_numOutputs, _inputs[inpIdx]->getNumCols());
    _inputs[inpIdx]->copy(getActs(), 0, -1, 0, -1, _copyOffsets->at(inpIdx), 0);
}

void ConcatenationLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    NVMatrix& copySrc = v.sliceRows(_copyOffsets->at(inpIdx), _copyOffsets->at(inpIdx + 1)); // view
    _prev[replicaIdx][inpIdx]->getActsGrad().add(copySrc, scaleTargets, 1);
    delete &copySrc;
}

/*
 * =======================
 * PassThroughLayer
 * =======================
 */
PassThroughLayer::PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
}

void PassThroughLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    // No-op
}

void PassThroughLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float
                                 scaleTargets, PASS_TYPE passType) {
    // No-op
}

bool PassThroughLayer::postInit() {
    if (Layer::postInit()) {
        assert(getNumInputReplicas() == 1);
        for (int i = 0, offset = 0; i < _prev[0].size(); offset += _prev[0][i]->getNumOutputs(), i++) {
            MemoryView& vActs = _memSrcActs[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair<int,int>(offset, offset + _prev[0][i]->getNumOutputs()));
            MemoryView& vActsGrad = _memSrcActsGrad[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair<int,int>(offset, offset + _prev[0][i]->getNumOutputs()));
            _prev[0][i]->setMemorySourceActs(getDeviceID(), vActs);
            _prev[0][i]->setMemorySourceActsGrad(getDeviceID(), vActsGrad);
        }
        return true;
    }
    return false;
}

/*
 * =======================
 * EltwiseSumLayer
 * =======================
 */
EltwiseSumLayer::EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
    _coeffs = pyDictGetFloatV(paramsDict, "coeffs");
}

EltwiseSumLayer::~EltwiseSumLayer() {
    delete _coeffs;
}

void EltwiseSumLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    getActs().add(*_inputs[inpIdx], scaleTargets, _coeffs->at(inpIdx));
}

void EltwiseSumLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    _prev[replicaIdx][inpIdx]->getActsGrad().add(v, scaleTargets, _coeffs->at(inpIdx));
}

/*
 * =======================
 * EltwiseMaxLayer
 * =======================
 */
EltwiseMaxLayer::EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
}

void EltwiseMaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    if (inpIdx == 1) { // Second input: input 0 by itself is a no-op, so inputs 0 and 1 get combined here
        _inputs[inpIdx]->applyBinary(NVMatrixAggs::Max(), *_inputs[0], getActs());
    } else if (inpIdx > 1) {
        getActs().applyBinary(NVMatrixAggs::Max(), *_inputs[inpIdx]);
    }
}

void EltwiseMaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    computeEltwiseMaxGrad(v, *_inputs[inpIdx], getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), scaleTargets != 0);
}

/*
 * =======================
 * DropoutLayer
 * =======================
 *
 * TODO: optimize away the case when using dropout over relus. Don't need the keepmask.
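 * (For a ReLU the mask is recoverable from the activations: a dropped or
 * negative unit has output 0, so bprop could gate on (acts > 0) instead of
 * storing _keepMask.)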
 */
DropoutLayer::DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
    _enable = pyDictGetInt(paramsDict, "enable");
    _keep = pyDictGetFloat(paramsDict, "keep");
}

void DropoutLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    if (_enable && passType == PASS_TRAIN) {
        _keepMask.resize(*_inputs[inpIdx]);
        _keepMask.randomizeUniform();
        _keepMask.apply(DropoutSmallerThanOperator(_keep));
        _inputs[inpIdx]->eltwiseMult(_keepMask, getActs());
    } else {
        _inputs[inpIdx]->copy(getActs());
    }
}

void DropoutLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    LayerV& prev = _prev[replicaIdx];
    if (_enable && passType == PASS_TRAIN) {
        if (scaleTargets != 0) {
            v.applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::Multiply>(NVMatrixBinaryOps::Multiply()),
                           _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
        } else {
            v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad());
        }
    } else {
        prev[inpIdx]->getActsGrad().add(v, scaleTargets, 1);
    }
}

void DropoutLayer::truncBwdActs() {
    Layer::truncBwdActs();
    _keepMask.truncate();
}

/*
 * =======================
 * Dropout2Layer
 * =======================
 *
 * TODO: optimize away the case when using dropout over relus. Don't need the keepmask.
 */
Dropout2Layer::Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: DropoutLayer(convNetThread, paramsDict, replicaID) {
}

void Dropout2Layer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    if (_enable && passType == PASS_TRAIN) {
        _keepMask.resize(*_inputs[inpIdx]);
        _keepMask.randomizeUniform();
        _keepMask.smallerThanScalar(_keep);
        _inputs[inpIdx]->eltwiseMult(_keepMask, getActs());
    } else {
        _inputs[inpIdx]->scale(_keep, getActs());
    }
}

void Dropout2Layer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    LayerV& prev = _prev[replicaIdx];
    if (_enable && passType == PASS_TRAIN) {
        if (scaleTargets != 0) {
            v.applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::Multiply>(NVMatrixBinaryOps::Multiply()),
                           _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
        } else {
            v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad());
        }
    } else {
        if (scaleTargets != 0) {
            v.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_keep)),
                          prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
        } else {
            v.scale(_keep, prev[inpIdx]->getActsGrad());
        }
    }
}

/*
 * =======================
 * DataLayer
 * =======================
 */
DataLayer::DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID)
: Layer(NULL, paramsDict, replicaID, false) {
    _dataIdx = pyDictGetInt(paramsDict, "dataIdx");
    _start = pyDictGetInt(paramsDict, "start");
    _end = pyDictGetInt(paramsDict, "end");
    _useBuffer = false;
    _outstandingCopyRequest = false;
    _convNet = convNet;
}

DataLayer::~DataLayer() {
    for (map<int, cudaStream_t>::const_iterator it = _copyStreams.begin(); it != _copyStreams.end(); ++it) {
        checkCudaErrors(cudaStreamDestroy(it->second));
    }
    for (std::map<int, MemoryView*>::iterator it = _memSrcActs2.begin(); it != _memSrcActs2.end(); ++it) {
        if (it->second->getMemorySource().truncate(_name)) {
            delete &it->second->getMemorySource();
        }
    }
    _copier->stop();
    delete _copier;
}

void DataLayer::fprop(PASS_TYPE passType, int passIdx, bool fromBuffer) {
    waitForCopyFinish();
    if (fromBuffer && getFwdActiveInputReplicaIdx(passIdx) >= 0) {
        _useBuffer = !_useBuffer;
    }
    for (int i = 0; i < _next.size(); i++) {
        _next[i]->getConvNetThread().getMessageQueue().enqueue(new
            FpropMessage(*_next[i], passType, passIdx));
    }
}

void DataLayer::waitForCopyFinish() {
    if (_outstandingCopyRequest) {
        _copyFinishQueue.dequeue();
        assert(_copyFinishQueue.getNumElements() == 0);
        _outstandingCopyRequest = false;
    }
}

cudaStream_t DataLayer::getCopyStream(int deviceID) {
    if (_copyStreams.count(deviceID) == 0) {
        NVMatrix::setDeviceID(deviceID);
        checkCudaErrors(cudaStreamCreateWithFlags(&_copyStreams[deviceID], cudaStreamNonBlocking));
    }
    return _copyStreams[deviceID];
}

void DataLayer::copyData(CPUData& data, bool other, int passIdx) {
    assert(!_outstandingCopyRequest);
    assert(_copyFinishQueue.getNumElements() == 0);
    _copier->getQueue().enqueue(new DataCopyMessage(data, other, passIdx));
    _outstandingCopyRequest = true;
}

int DataLayer::getNumInputReplicas() {
    return _convNet->getNumReplicasMax() / getNumReplicas();
}

void DataLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
}

NVMatrix& DataLayer::getActs(int deviceID) {
    return getActs(deviceID, false, -1);
}

NVMatrix& DataLayer::getActs(int deviceID, bool other, int numCases) {
    // printf("%s[%d] getActs(%d, %d, %d)\n", _name.c_str(), getReplicaID(), deviceID, other, numCases);
    assert(_memSrcActs.count(deviceID) > 0);
    assert(_memSrcActs2.count(deviceID) > 0);
    return (_useBuffer != other ? _memSrcActs2[deviceID]->getMemory(numCases) : _memSrcActs[deviceID]->getMemory(numCases));
}

ConvNet& DataLayer::getConvNet() {
    return *_convNet;
}

bool DataLayer::postInit() {
    if (Layer::postInit()) {
        for (int i = 0; i < _next.size(); ++i) {
            int d = _next[i]->getDeviceID();
            if (_memSrcActs2.count(d) == 0) {
                _memSrcActs2[d] = &MemorySource::make(_numOutputs, d, getName());
            }
        }
        intv cpus = getDeviceCPUs(_next[0]->getDeviceID());
        _copier = new DataCopyThread(*this, cpus);
        _copier->start();
        return true;
    }
    return false;
}

bool DataLayer::isGradProducer() {
    return false;
}

/*
 * =======================
 * DataCopyThread
 * =======================
 */
DataCopyThread::DataCopyThread(DataLayer& parent, intv& cpus)
: _parent(&parent), _sleepUsec(0), Thread(true, cpus) {
}

Queue<DataCopyMessage*>& DataCopyThread::getQueue() {
    return _queue;
}

void DataCopyThread::stop() {
    getQueue().enqueue(new DataCopyExitMessage());
    join();
}

void* DataCopyThread::run() {
    NVMatrix::setDeviceID(*_parent->getNextDeviceIDs().begin());
    bool exit = false;
    while (!exit) {
        DataCopyMessage& msg = *_queue.dequeue();
        exit = msg.getType() == DataCopyMessage::EXIT;
        if (!exit) {
            CPUData& data = msg.getData();
            int passIdx = msg.getPassIdx();
            bool other = msg.isOther();

            Matrix& dataMatrix = data.getData(_parent->getDataIdx());
            // How many times is this layer going to process microbatches from this minibatch?
            assert(_parent->getNumReplicasNext() == _parent->getNumReplicas());
            int microIdx = _parent->getFwdActiveInputReplicaIdx(passIdx);

            if (microIdx >= 0) {
                if (_requestTimer.isStarted()) {
                    double requestIntervalMsec = _requestTimer.stop();
                    // Sleep for up to 1/20th the average request interval
                    _sleepUsec = int(round(0.95 * _sleepUsec + 0.05 * (_parent->getReplicaID() / double(_parent->getNumReplicas())) * requestIntervalMsec * 1000.0 / 20.0));
                }
                _requestTimer.start();
                if (other) {
                    // Sleeping a bit is helpful because in typical nets, copying input data
                    // as soon as it's available will produce contention with other communications
                    // that are happening at the time. This is very much a hack, so in the future
                    // it might be good to replace it with something smarter which schedules access
                    // to communication links.
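                    // (In steady state the EMA above settles at (replicaID / numReplicas)
                    // * requestInterval / 20; e.g. replica 2 of 4 with a 100 ms request
                    // interval sleeps about 2.5 ms, staggering the replicas' copies.)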
                    usleep(_sleepUsec);
                }

                microIdx += _parent->getReplicaID() * _parent->getNumInputReplicas();
                // Safer to divup because this way you won't get a minibatch size of 0
                int microbatchSize = DIVUP(data.getNumCases(), _parent->getConvNet().getNumReplicasMax());
                int microStart = microIdx * microbatchSize;
                int microEnd = min(data.getNumCases(), (microIdx + 1) * microbatchSize);
                // Check that this replica has some data. This can be false when, for example,
                // there are only 7 examples in the minibatch but 8 replicas.
                if (microStart < microEnd) {
                    assert(dataMatrix.isView() == dataMatrix.isTrans());
                    int pipe = _parent->getConvNet().getDataCopyPD().getPipe(_parent->getReplicaID()/2);
                    if (dataMatrix.isTrans()) {
                        Matrix& replicaDataMatrix = dataMatrix.sliceCols(microStart, microEnd);
                        // In this case, dataMatrix is a view on memory allocated by Python.
                        //_hostMemFwd.copyFromHost(replicaDataMatrix, true);
                        _hostMemFwd.resize(replicaDataMatrix.getNumRows(), replicaDataMatrix.getNumCols(), true);
                        memcpy(_hostMemFwd.getDevData(), replicaDataMatrix.getData(), replicaDataMatrix.getNumDataBytes());
                        delete &replicaDataMatrix; // view
                        NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd());
                        for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
                            int deviceID = *it;
                            // Copy my output to this guy's GPU
                            NVMatrix::setDeviceID(deviceID);
                            // Note to self: this is the path that gets executed in practice
                            // in my models. It does a transpose & copy simultaneously.
                            hostMemFwdSlice.flipTrans(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID));
                        }
                        delete &hostMemFwdSlice;
                    } else {
                        // Hacky way to copy a slice to _hostMemFwd
                        _hostMemFwd.resize(dataMatrix.getNumRows(), microEnd - microStart);
                        Matrix tmp(_hostMemFwd.getDevData(), _hostMemFwd.getNumRows(), _hostMemFwd.getNumCols(), _hostMemFwd.isTrans());
                        dataMatrix.sliceCols(microStart, microEnd, tmp);
                        NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd());
                        for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
                            int deviceID = *it;
                            // Copy my output to this guy's GPU
                            NVMatrix::setDeviceID(deviceID);
                            hostMemFwdSlice.copy(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID));
                        }
                        delete &hostMemFwdSlice;
                    }

                    for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
                        int deviceID = *it;
                        NVMatrix::setDeviceID(deviceID);
                        NVMatrix::syncStream(_parent->getCopyStream(deviceID));
                    }
                    _parent->getConvNet().getDataCopyPD().freePipe(pipe);
                } else {
                    for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
                        int deviceID = *it;
                        _parent->getActs(deviceID, other, 0);
                    }
                }
            }
            _parent->getCopyFinishQueue().enqueue(1);
        }
        delete &msg;
    }
    return NULL;
}

/*
 * =====================
 * PoolLayer
 * =====================
 */
PoolLayer::PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans)
: Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) {
    _sizeX = pyDictGetInt(paramsDict, "sizeX");
    _start = pyDictGetInt(paramsDict, "start");
    _stride = pyDictGetInt(paramsDict, "stride");
    _outputsX = pyDictGetInt(paramsDict, "outputsX");
    _pool = pyDictGetString(paramsDict, "pool");
}

PoolLayer& PoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) {
    std::string _pool =
pyDictGetString(paramsDict, "pool"); if (_pool == "max") { return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, false); } else if(_pool == "maxabs") { return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, true); } else if(_pool == "avg") { return *new AvgPoolLayer(convNetThread, paramsDict, replicaID); } throw std::string("Unknown pooling layer type ") + _pool; } /* * ===================== * AvgPoolLayer * ===================== */ AvgPoolLayer::AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : PoolLayer(convNetThread, paramsDict, replicaID, false) { _sum = pyDictGetInt(paramsDict, "sum"); } void AvgPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { if (_sum) { convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); } else { convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler()); } } void AvgPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { convLocalAvgUndo(v, _prev[replicaIdx][0]->getActsGrad(), _sizeX, _start, _stride, _outputsX, _imgSize, _sum, scaleTargets, 1); } /* * ===================== * MaxPoolLayer * ===================== */ MaxPoolLayer::MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs) : PoolLayer(convNetThread, paramsDict, replicaID, false), _abs(abs) { } void MaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { if (_abs) { convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxAbsPooler()); } else { convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxPooler()); } } void MaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { assert(inpIdx == 0); convLocalMaxUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _sizeX, _start, _stride, _outputsX, scaleTargets, 1); } /* * ===================== * CrossMapPoolLayer * ===================== */ CrossMapPoolLayer::CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) { _size = pyDictGetInt(paramsDict, "size"); _start = pyDictGetInt(paramsDict, "start"); _stride = pyDictGetInt(paramsDict, "stride"); _outputs = pyDictGetInt(paramsDict, "outputChannels"); _pool = pyDictGetString(paramsDict, "pool"); } CrossMapPoolLayer& CrossMapPoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) { std::string _pool = pyDictGetString(paramsDict, "pool"); if (_pool == "max") { return *new CrossMapMaxPoolLayer(convNetThread, paramsDict, replicaID); } throw std::string("Unknown pooling layer type ") + _pool; } /* * ===================== * CrossMapMaxPoolLayer * ===================== */ CrossMapMaxPoolLayer::CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CrossMapPoolLayer(convNetThread, paramsDict, replicaID, false) { } void CrossMapMaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { convPoolCrossMap(*_inputs[0], getActs(), _start, _size, _outputs, _stride, _imgSize, MaxPooler()); } void CrossMapMaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { assert(inpIdx == 0); convCrossMapMaxPoolUndo(*_inputs[0], v, getActs(), 
                            _prev[replicaIdx][0]->getActsGrad(), _imgSize, _start, _size, _stride, scaleTargets, 1);
}

/*
 * =====================
 * RandomScaleLayer
 * =====================
 */
RandomScaleLayer::RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _maxScale = pyDictGetFloat(paramsDict, "maxScale");
    _tgtSize = pyDictGetInt(paramsDict, "tgtSize");
    // The smallest size the image could be after rescaling
    _minScaledSize = _imgSize / _maxScale;

    // The number of discrete scales we're considering
    int numScales = _imgSize - _minScaledSize + 1;

    // The total number of squares of size _tgtSize that we can extract
    // from all these scales
    double numCrops = numScales * (numScales + 1) * (2 * numScales + 1) / 6;

    // For each scale, record the fraction of the squares that it has.
    // This will be the probability of sampling this scale.
    _scaleProbs.push_back(1.0 / numCrops);
    for (int s = 1; s < numScales; ++s) {
        _scaleProbs.push_back(_scaleProbs[s-1] + (s + 1) * (s + 1) / numCrops);
    }
}

void RandomScaleLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    if (IS_TRAIN(passType)) {
        // _maxScale is in the range [1, 2)
        float r = randf;
        int rescaledSize = _tgtSize;
        float scaleFactor = _maxScale;
        // Find which scale we have sampled
        for (int s = 0; s < _scaleProbs.size(); ++s) {
            if (r <= _scaleProbs[s]) {
                rescaledSize += s;
                float scaleFactorEnd = _imgSize / float(rescaledSize);
                float scaleFactorStart = max(1.0, _imgSize / (1.0 + rescaledSize));
                scaleFactor = scaleFactorStart + randf * (scaleFactorEnd - scaleFactorStart);
                break;
            }
        }
        assert(rescaledSize >= _tgtSize);
        int maxStart = rescaledSize - _tgtSize;
        int startY = rand() % (1 + maxStart), startX = rand() % (1 + maxStart);
        if (rescaledSize == _imgSize) {
            convCrop(*_inputs[0], getActs(), rescaledSize, _tgtSize, startY, startX);
        } else {
            convResizeBilinear(*_inputs[0], _rescaledActs, _imgSize, rescaledSize, scaleFactor);
            convCrop(_rescaledActs, getActs(), rescaledSize, _tgtSize, startY, startX);
        }
        _rescaledActs.truncate(); // this'll have a different size each time so may as well truncate it.
    } else if (IS_MULTIVIEW_TEST(passType)) { // for now...
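        // (Sampling above: scale s is chosen with probability (s+1)^2 / numCrops,
        // matching its share of crop positions; with numScales = 3 that's
        // 1 + 4 + 9 = 14 crops split 1/14, 4/14, 9/14. Multi-view test passes just
        // forward the input unchanged; any per-view averaging happens downstream,
        // e.g. LogregCostLayer accumulates probs across views.)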
        _inputs[0]->copy(getActs());
    } else if (IS_TEST(passType)) { // Test on center patch
        convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _maxScale);
    }
}

void RandomScaleLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(false);
}

/*
 * =====================
 * CropLayer
 * =====================
 */
CropLayer::CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _startX = pyDictGetInt(paramsDict, "startX");
    _startY = pyDictGetInt(paramsDict, "startY");
    _tgtSize = pyDictGetInt(paramsDict, "sizeX");
}

void CropLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convCrop(*_inputs[0], getActs(), _imgSize, _tgtSize, _startY, _startX);
}

void CropLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(false);
}

/*
 * =====================
 * NailbedLayer
 * =====================
 */
NailbedLayer::NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _start = pyDictGetInt(paramsDict, "start");
    _stride = pyDictGetInt(paramsDict, "stride");
    _outputsX = pyDictGetInt(paramsDict, "outputsX");
}

void NailbedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convBedOfNails(*_inputs[0], getActs(), _channels, _imgSize, _start, _stride, 0, 1);
}

void NailbedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convBedOfNailsUndo(v, _prev[replicaIdx][0]->getActsGrad(), _channels, _imgSize, _start, _stride, scaleTargets, 1);
}

/*
 * =====================
 * GaussianBlurLayer
 * =====================
 */
GaussianBlurLayer::GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _hFilter = pyDictGetMatrix(paramsDict, "filter");
}

GaussianBlurLayer::~GaussianBlurLayer() {
    delete _hFilter;
}

void GaussianBlurLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convGaussianBlur(*_inputs[0], _filter, getActs(), true, _channels, 0, 1);
    convGaussianBlur(getActs(), _filter, getActs(), false, _channels, 0, 1);
}

void GaussianBlurLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    NVMatrix& tgt = _prev[replicaIdx][0]->getNumComputedActsGrads(getDeviceID()) > 0 ?
        _actGradsTmp : _prev[replicaIdx][0]->getActsGrad();
    convGaussianBlur(v, _filter, tgt, true, _channels, 0, 1);
    convGaussianBlur(tgt, _filter, _prev[replicaIdx][0]->getActsGrad(), false, _channels, scaleTargets, 1);
}

void GaussianBlurLayer::copyToGPU() {
    _filter.copyFromHost(*_hFilter, true);
}

/*
 * =====================
 * HorizontalReflectionLayer
 * =====================
 */
HorizontalReflectionLayer::HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID)
: Layer(convNet, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    assert(_channels >= 1 && _channels <= 3);
}

void HorizontalReflectionLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convReflectHorizontal(*_inputs[0], getActs(), _imgSize);
}

void HorizontalReflectionLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convReflectHorizontal(v, _prev[replicaIdx][0]->getActsGrad(), _imgSize);
}

/*
 * =====================
 * ResizeLayer
 * =====================
 */
ResizeLayer::ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _tgtSize = pyDictGetInt(paramsDict, "tgtSize");
    _scale = pyDictGetFloat(paramsDict, "scale");
}

void ResizeLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _scale);
}

// Can't do this
void ResizeLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(false);
}

/*
 * =====================
 * RGBToYUVLayer
 * =====================
 */
RGBToYUVLayer::RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
}

void RGBToYUVLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convRGBToYUV(*_inputs[0], getActs());
}

// Can't do this
void RGBToYUVLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(false);
}

/*
 * =====================
 * RGBToLABLayer
 * =====================
 */
RGBToLABLayer::RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false) {
    _center = pyDictGetInt(paramsDict, "center");
}

void RGBToLABLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convRGBToLAB(*_inputs[0], getActs(), _center);
}

// Can't do this
void RGBToLABLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(false);
}

/*
 * =====================
 * ResponseNormLayer
 * =====================
 */
ResponseNormLayer::ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
    _size = pyDictGetInt(paramsDict, "size");
    _scale = pyDictGetFloat(paramsDict, "scale");
    _pow = pyDictGetFloat(paramsDict, "pow");
    _minDiv = pyDictGetFloat(paramsDict, "minDiv");
}

void ResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    convResponseNorm(*_inputs[0], _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv);
}

void ResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convResponseNormUndo(v, _denoms, *_inputs[0], getActs(),
                         _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1);
}

void ResponseNormLayer::truncBwdActs() {
    Layer::truncBwdActs();
    _denoms.truncate();
}

/*
 * =====================
 * CrossMapResponseNormLayer
 * =====================
 */
CrossMapResponseNormLayer::CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: ResponseNormLayer(convNetThread, paramsDict, replicaID) {
    _blocked = pyDictGetInt(paramsDict, "blocked");
}

void CrossMapResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    assert(inpIdx == 0);
    convResponseNormCrossMap(*_inputs[0], getActs(), _channels, _size, _scale, _pow, _minDiv, _blocked);
}

void CrossMapResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convResponseNormCrossMapUndo(v, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, _minDiv, _blocked, scaleTargets, 1);
}

/*
 * =====================
 * ContrastNormLayer
 * =====================
 */
ContrastNormLayer::ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
: ResponseNormLayer(convNetThread, paramsDict, replicaID) {
}

void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    NVMatrix& images = *_inputs[0];
    convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler());
    _meanDiffs.add(images, -1, 1);
    convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv);
}

void ContrastNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    convContrastNormUndo(v, _denoms, _meanDiffs, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1);
}

void ContrastNormLayer::truncBwdActs() {
    ResponseNormLayer::truncBwdActs();
    _meanDiffs.truncate();
}

/*
 * =====================
 * CostLayer
 * =====================
 */
CostLayer::CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans)
: Layer(convNetThread, paramsDict, replicaID, trans) {
    _coeff = pyDictGetFloat(paramsDict, "coeff");
    _numCases = 0;
    _aggregated = pyDictGetInt(paramsDict, "aggregated") != 0;
}

float CostLayer::getCoeff() {
    return _coeff;
}

void CostLayer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) {
    if (_coeff != 0) {
        Layer::bprop(v, passType, passIdx);
    }
}

bool CostLayer::fprop(PASS_TYPE passType, int passIdx) {
    if (Layer::fprop(passType, passIdx)) {
        syncStream();
        getConvNet().getMessageQueue().enqueue(new Message(FPROP_TERMINAL));
        return true;
    }
    return false;
}

void CostLayer::fpropCommon(PASS_TYPE passType) {
    _numCases = Layer::getNumCases(*_inputs[0]);
}

int CostLayer::getNumCases() {
    return _numCases;
}

bool CostLayer::isGradProducer() {
    return _coeff != 0;
}

doublev& CostLayer::getCost() {
    return *new doublev(_costv);
}

// This is called between microbatches
void CostLayer::resetPassIdx() {
    Layer::resetPassIdx();
    _costv.clear();
}

CostLayer& CostLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID) {
    if (type == "cost.crossent") {
        return *new CrossEntCostLayer(convNetThread, paramsDict, replicaID);
    } else if (type == "cost.bce") {
        return *new BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID);
    } else if (type == "cost.dce") {
        return *new DetectionCrossEntropyCostLayer(convNetThread, paramsDict, replicaID);
    } else if (type ==
"cost.logreg") { return *new LogregCostLayer(convNetThread, paramsDict, replicaID); } else if (type == "cost.sum2") { return *new SumOfSquaresCostLayer(convNetThread, paramsDict, replicaID); } throw std::string("Unknown cost layer type ") + type; } /* * ===================== * CrossEntCostLayer * ===================== */ CrossEntCostLayer::CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { } void CrossEntCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { // This layer uses its two inputs together if (inpIdx == 0) { NVMatrix& labels = *_inputs[0]; NVMatrix& probs = *_inputs[1]; int numCases = labels.getLeadingDim(); computeCrossEntCost(labels, probs, _trueLabelLogProbs, _correctProbs); _costv.clear(); _costv.push_back(-_trueLabelLogProbs.sum()); _costv.push_back(numCases - _correctProbs.sum()); } } void CrossEntCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { assert(inpIdx == 1); LayerV& prev = _prev[replicaIdx]; NVMatrix& labels = *_inputs[0]; NVMatrix& probs = *_inputs[1]; NVMatrix& target = prev[1]->getActsGrad(); // Numerical stability optimization: if the layer below me is a softmax layer, let it handle // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" || prev[1]->getDeviceID() != getDeviceID(); if (doWork) { computeCrossEntGrad(labels, probs, target, scaleTargets == 1, _coeff); } } /* * ===================== * BinomialCrossEntropyCostLayer * ===================== */ BinomialCrossEntropyCostLayer::BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { _computeSoftmaxErrorRate = pyDictGetInt(paramsDict, "computeSoftmaxErrorRate"); _posWeight = pyDictGetFloat(paramsDict, "posWeight"); } void BinomialCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { // This layer uses its two inputs together if (inpIdx == 0) { NVMatrix& labels = *_inputs[0]; NVMatrix& probs = *_inputs[1]; int numCases = labels.getLeadingDim(); labels.applyBinary(BinomialCrossEntOperator(_posWeight), probs, _tmpProbs); _costv.clear(); // Cross-entropy cost _costv.push_back(-_tmpProbs.sum(_tmpbuf));// / labels.getFollowingDim()); // If aggregated, we don't produce these outputs because they're not additive. // They have no meaning if this is just a partial cost. if (!_aggregated) { // "Correct" classifications. To compute these we threshold probs // and just count the number of entries that agree with labels. 
            probs.biggerThanScalar(0.5, _tmpProbs);
            _tmpProbs.equals(labels);
            _costv.push_back((_tmpProbs.getNumElements() - _tmpProbs.sum(_tmpbuf)) / double(labels.getFollowingDim()));
            if (_computeSoftmaxErrorRate) {
                // Also compute top-1 error as if this is softmax and there's only one correct class
                probs.max(0, _tmpVec);
                assert(_tmpVec.getNumElements() == numCases); // Make sure we did max on correct axis
                probs.equalsVector(_tmpVec, _correctProbs);
                _correctProbs.sum(0, _tmpVec); // Divide by the # of labels that we predict as being present
                float m = _tmpVec.max();
                _correctProbs.eltwiseDivideByVector(_tmpVec);
                _correctProbs.eltwiseMult(labels);
                _costv.push_back(numCases - _correctProbs.sum(_tmpbuf));
            }
        }
    }
}

void BinomialCrossEntropyCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    assert(inpIdx == 1);
    LayerV& prev = _prev[replicaIdx];
    NVMatrix& labels = *_inputs[0];
    NVMatrix& probs = *_inputs[1];
    NVMatrix& target = prev[1]->getActsGrad();
    // Numerical stability optimization: if the layer below me is a logistic neuron layer, let it handle
    // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity.
    bool doWork = prev[1]->getNext().size() > 1
                || prev[1]->getType() != "neuron"
                || static_cast<NeuronLayer*>(prev[1])->getNeuronType() != "logistic"
                || prev[1]->getDeviceID() != getDeviceID()
                || prev[1]->getNumReplicas() != getNumReplicas();
    if (doWork) {
        printf("Computing cross-entropy gradient the stupid way\n");
        if (scaleTargets == 0) {
            labels.applyBinary(BinomialCrossEntGradientOperator(_coeff, _posWeight), probs, target);
        } else {
            labels.applyTernary(AddGradientBinaryOperator<BinomialCrossEntGradientOperator>(BinomialCrossEntGradientOperator(_coeff, _posWeight)), probs, target, target);
        }
    }
}

float BinomialCrossEntropyCostLayer::getPosWeight() {
    return _posWeight;
}

/*
 * =====================
 * DetectionCrossEntropyCostLayer
 * =====================
 */
DetectionCrossEntropyCostLayer::DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
    : BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID) {
    assert(!_aggregated);
}

void DetectionCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    BinomialCrossEntropyCostLayer::fpropActs(inpIdx, scaleTargets, passType, passIdx);
    // This layer uses its two inputs together
    if (inpIdx == 0) {
        NVMatrix& labels = *_inputs[0];
        NVMatrix& probs = *_inputs[1];
        int numCases = labels.getLeadingDim();
        /*
         * Add information sufficient to compute precision and recall for each class.
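         * Concretely, the three counters pushed below let the caller form, per class,
         *   precision = numTruePositive / numDeclaredPositive
         *   recall    = numTruePositive / numPositive
         * where "declared positive" means prob > 0.5.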
*/ // NOTE: _tmpProbs contains ((probs > 0.5) == labels) labels.sum(1, _numPositive); // sum(labels, 1) _tmpProbs.eltwiseMult(labels); // labels * ((probs > 0.5) == labels) _tmpProbs.sum(1, _numTruePositive); probs.biggerThanScalar(0.5, _tmpProbs); _tmpProbs.sum(1, _numDeclaredPositive); _numDeclaredPositive.copyToHost(_hNumDeclaredPositive, true); _numPositive.copyToHost(_hNumPositive, true); _numTruePositive.copyToHost(_hNumTruePositive, true); for (int i = 0; i < labels.getFollowingDim(); ++i) { _costv.push_back(_hNumDeclaredPositive(i, 0)); // 2 _costv.push_back(_hNumPositive(i, 0)); // 3 _costv.push_back(_hNumTruePositive(i, 0)); // 4 } } } /* * ===================== * LogregCostLayer * ===================== */ LogregCostLayer::LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) { _topk = pyDictGetInt(paramsDict, "topk"); // _numAccumed = 0; } void LogregCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) { // This layer uses its two inputs together if (inpIdx == 0) { NVMatrix& labels = *_inputs[0]; NVMatrix* probs = _inputs[1]; _doCompute = !IS_MULTIVIEW_TEST(passType); if (!_doCompute) { if (IS_MULTIVIEW_TEST_START(passType)) { if (_probsAccum.count(passIdx) == 0) { _probsAccum[passIdx] = new NVMatrix(*probs); } probs->copy(*_probsAccum[passIdx]); _numAccumed[passIdx] = 1; } else { _probsAccum[passIdx]->add(*probs); _numAccumed[passIdx] += 1; } if (IS_MULTIVIEW_TEST_END(passType)) { probs = _probsAccum[passIdx]; probs->scale(1.0 / _numAccumed[passIdx]); _doCompute = true; } } if (_doCompute) { int numCases = labels.getNumElements(); probs->max(0,_maxProbs); if (_topk == 1) { computeLogregCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs); } else { computeMultiSoftmaxCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs, _topkProbs, _topk); } _costv.clear(); double top1 = _correctProbs.sum(_tmpbuf); _costv.push_back(-_trueLabelLogProbs.sum(_tmpbuf)); _costv.push_back(numCases - top1); _costv.push_back(numCases - (_topk == 1 ? top1 : _topkProbs.sum(_tmpbuf))); } } } NVMatrix& LogregCostLayer::getProbsAccum(int replicaIdx) { return *_probsAccum[replicaIdx]; } void LogregCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) { if (inpIdx == 1) { LayerV& prev = _prev[replicaIdx]; NVMatrix& labels = *_inputs[0]; NVMatrix& probs = *_inputs[1]; NVMatrix& target = prev[1]->getActsGrad(); // Numerical stability optimization: if the layer below me is a softmax layer, let it handle // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity. 
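        // (Why the fused path is stable, in outline: by itself this layer would emit
        // a per-output gradient of the form coeff * 1{i == label} / y_label, which
        // blows up as y_label -> 0; folding in the softmax Jacobian analytically
        // yields the bounded form coeff * (1{i == label} - y_i), which is exactly
        // what kLogregSoftmaxGrad below computes.)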
        bool doWork = prev[1]->getNext().size() > 1
                    || prev[1]->getType() != "softmax"
                    || prev[1]->getDeviceID() != getDeviceID()
                    || prev[1]->getNumReplicas() != getNumReplicas();
        if (prev[1]->getType() == "softmax") {
            static_cast<SoftmaxLayer*>(prev[1])->setDoUpperGrad(!doWork);
        }
        if (doWork) {
            computeLogregGrad(labels, probs, target, scaleTargets == 1, _coeff);
        }
    }
}

/*
 * =====================
 * SumOfSquaresCostLayer
 * =====================
 */
SumOfSquaresCostLayer::SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
    : CostLayer(convNetThread, paramsDict, replicaID, false) {
}

void SumOfSquaresCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
    _inputs[0]->apply(NVMatrixOps::Square(), _tmp);
    _costv.clear();
    _costv.push_back(_tmp.sum());
}

void SumOfSquaresCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
    _prev[replicaIdx][inpIdx]->getActsGrad().add(*_inputs[0], scaleTargets, -2 * _coeff);
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu ================================================

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <vector>
#include <cmath>
#include "../include/layer_kernels.cuh"

using namespace std;

/*
 * E = -log(y_t)
 * probs:          (numOut, numCases)
 * labels:         (1, numCases)
 * maxEnergies:    (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 * top5Probs:      (1, numCases)   (*out)
 *
 * target:         (1, numCases)
 */
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
                                  float* labelLogProbs, float* correctProbs, float* top5Probs,
                                  const int numCases, const int numOut, const int setSize) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        int numBiggerProbs = 0, numEqualsProbs = 0;
        for (int i = 0; i < numOut; ++i) {
            numBiggerProbs += probs[i * numCases + tx] > labelp;
            numEqualsProbs += probs[i * numCases + tx] == labelp;
        }

        const int slotsLeft = setSize - numBiggerProbs;

        top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f
                      : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
        correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
    }
}

/*
 * E = -log(y_t)
 * probs:          (numOut, numCases)
 * labels:         (1, numCases)
 * maxProbs:       (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 * top5Probs:      (1, numCases)   (*out)
 *
 * target:         (1, numCases) == log(y_l[labels,:]
 */
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs,
                             NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out,
                             NVMatrix& top5Probs_out, int setSize) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

//    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    top5Probs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
    kMultiSoftmaxCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                      labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
                                                      numCases, numOut, setSize);
    getLastCudaError("kMultiSoftmaxCost: Kernel execution failed");
//    cudaThreadSynchronize();
}

/*
 * E = sum(p_l * log(y_l))
 * probs:          (numOut, numCases)
 * labels:         (numOut, numCases)
 * maxProbs:       (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 *
 * target:         (1, numCases)
 */
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                              const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
    if (tx < numCases) {
        probs += tx;
        labels += tx;
        maxProbs += tx;
        labelLogProbs += tx;
        correctProbs += tx;

        const float maxp = maxProbs[0];

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
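         * (Worked example with made-up numbers: if two outputs tie at the maximum
         * probability and one of them is the true label, numMax below comes out as 2
         * and the case contributes 1/2 to correctProbs rather than a full count.)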
         */
        float crossEnt = 0.0f;
        int numMax = 0;
        bool correctLabel = false;
        for (int i = 0; i < numOut; i++) {
            const float label_prob = labels[i * numCases];
            const float model_prob = probs[i * numCases];
            numMax += model_prob == maxp;
            crossEnt += label_prob * safelog(model_prob);
            correctLabel |= model_prob == maxp && label_prob > 0.0f;
        }
        labelLogProbs[0] = crossEnt;
        if (!correctLabel) {
            correctProbs[0] = 0.0f;
        } else {
            correctProbs[0] = 1.0f / float(numMax);
        }
    }
}

/*
 * E = sum(p_l * log(y_l))
 * y_l:     (numOut, numCases)
 * labels:  (numOut, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float label_prob = labels[tidx];
        const float model_prob = y_l[tidx];
        const float v = gradCoeff * __fdividef(label_prob, model_prob);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}

/*
 * E = sum(p_l * log(y_l))
 * y_l:     (numOut, numCases)
 * labels:  (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float model_prob = y_l[tidx];
        const float label_prob = labels[tidx];
        float v = gradCoeff * (label_prob - model_prob);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}

/*
 * E = -log(y_t)
 * probs:          (numOut, numCases)
 * labels:         (1, numCases)
 * maxProbs:       (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 *
 * target:         (1, numCases)
 */
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                            const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        if (labelp != maxp) {
            correctProbs[tx] = 0;
        } else {
            int numMax = 0;
            for (int i = 0; i < numOut; i++) {
                numMax += probs[i * numCases + tx] == maxp;
            }
            correctProbs[tx] = 1.0f / float(numMax);
        }
    }
}

/*
 * E = -log(y_t)
 * y_l:     (numOut, numCases)
 * labels:  (1, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases, const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * (label == ty);
        v = __fdividef(v, y_l[tidx]);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}

/*
 * E = -log(y_t)
 * y_l:     (numOut, numCases)
 * labels:  (1, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases, const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * ((label == ty) - y_l[tidx]);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}

/*
 * dE_dy_l: (numOut, numCases)
 * y_l:     (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        float v = 0;
        for (int j = 0; j < numOut; j++) {
            v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
        }
        v *= y_l[tidx];

        if (add) {
            dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v;
        } else {
            dE_dx_l[tidx] = scaleGrad * v;
        }
    }
}

template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target, const int numElements) {
    for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
        if (add) {
            target[i] += actGrad[i] * (output[i] == input[i]);
        } else {
            target[i] = actGrad[i] * (output[i] == input[i]);
        }
    }
}

void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
    assert(actGrad.isContiguous());
    assert(output.isContiguous());
    assert(input.isContiguous());
    assert(actGrad.isSameDims(input));
    assert(actGrad.isSameDims(output));

    dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
    dim3 threads(128);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (add) {
        assert(actGrad.isSameDims(target));
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, true><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    } else {
        target.resize(actGrad);
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, false><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    }

    getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}

/*
 * E = sum_i{-p_i*log(y_i)}
 * probs:          (numOut, numCases)
 * labels:         (numOut, numCases)
 * maxProbs:       (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 *
 * target:         (1, numCases)
 */
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.isSameDims(probs));
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
    kCrossEntCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                  labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                  numCases, numOut);
    getLastCudaError("kCrossEntCost: Kernel execution failed");

    delete &maxProbs;
}

void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();

    assert(labels.isSameDims(probs));
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kCrossEntGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    } else {
        kCrossEntGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    }

    getLastCudaError("kCrossEntGrad: Kernel execution failed");
}

void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) {
    int numCases = acts.getLeadingDim();
    int numOut = acts.getFollowingDim();

    assert(acts.isSameDims(actsGrad));
    assert(acts.isContiguous());
    assert(actsGrad.isContiguous());
    assert(target.isContiguous());
    assert(acts.isTrans());
    assert(actsGrad.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (scaleTarget == 0) {
        target.resize(acts);
        kSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    } else {
        kSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    }
    getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}

void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());
    assert(!labels.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    } else {
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    }
    getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}

/*
 * E = -log(y_t)
 * probs:          (numOut, numCases)
 * labels:         (1, numCases)
 * maxProbs:       (1, numCases)
 * labelLogProbs:  (1, numCases)   (*out)
 * correctProbs:   (1, numCases)   (*out)
 *
 * target:         (1, numCases) == log(y_l[labels,:]
 */
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
    kLogregCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                numCases, numOut);
    getLastCudaError("computeLogregCost: Kernel execution failed");
}

void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();

    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregCostGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    } else {
        kLogregCostGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregGrad: Kernel execution failed");
}

void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();

    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    } else {
        kLogregSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(), numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu ================================================

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cmath>
#include "../include/lr.cuh"
#include "../include/util.cuh"

/*
 * ==================================
 * ParameterSchedule
 * ==================================
 */
ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) {
    std::string type = pyDictGetString(schedDict, "type");
    PyObject* paramsDict = PyDict_GetItemString(schedDict, "params");
    double base = pyDictGetFloat(paramsDict, "base");
    if (type == "const") {
        return *new ParameterSchedule(base);
    } else {
        double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
        if (type == "linear") {
            return *new LinearParameterSchedule(base, tgtFactor);
        } else if (type == "exp") {
            return *new ExpParameterSchedule(base, tgtFactor);
        } else if (type == "dexp") {
            double numSteps = pyDictGetInt(paramsDict, "numSteps");
            return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps);
        }
    }
    throw std::string("Unknown learning rate schedule type ") + type;
}

ParameterSchedule::ParameterSchedule(double baseRate) : _baseRate(baseRate) {
}

double ParameterSchedule::getValue(double progress) {
    return _baseRate;
}

double ParameterSchedule::getBaseValue() const {
    return _baseRate;
}

ParameterSchedule::~ParameterSchedule() {
}

/*
 * ==================================
 * LinearParameterSchedule
 * ==================================
 */
LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor) : ParameterSchedule(baseRate) {
    _finalRate = baseRate / tgtFactor;
}

double LinearParameterSchedule::getValue(double progress) {
    return _baseRate * (1 - progress) + _finalRate * progress;
}

/*
 * ==================================
 * ExpParameterSchedule
 * ==================================
 */
ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor) : ParameterSchedule(baseRate) {
    _powBase = 1.0 / tgtFactor;
}

double ExpParameterSchedule::getValue(double progress) {
    return _baseRate * std::pow(_powBase, progress);
}

/*
 * ==================================
 * DiscreteExpParameterSchedule
 * ==================================
 */
DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps) : ParameterSchedule(baseRate) {
    ExpParameterSchedule elrs(baseRate, tgtFactor);
    double finalRate = baseRate / tgtFactor;
    for (int i = 0; i < numSteps - 1; i++) {
        double progress = double(i) / (numSteps - 1);
        _rates.push_back(elrs.getValue(progress));
    }
    _rates.push_back(finalRate);
    //printf("initialized base %e, final %e, steps %d\n", baseRate, finalRate, numSteps);
}

double DiscreteExpParameterSchedule::getValue(double progress) {
    for (int i = 0; i < _rates.size(); ++i) {
        if (progress <= double(i + 1) / _rates.size()) {
            return _rates[i];
        }
    }
    return _rates.back();
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu ================================================

/*
 * Copyright 2014 Google Inc. All rights reserved.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/memorysource.cuh" using namespace std; /* * ======================= * MemoryView * ======================= */ MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) { } MemoryView::~MemoryView() { // if (_src->truncate(_name)) { // delete _src; // } } NVMatrix& MemoryView::getMemory(int numCases) { return _src->getMemory(_name, numCases); } NVMatrix& MemoryView::getMemory() { return _src->getMemory(_name); } MemorySource& MemoryView::getMemorySource() { return *_src; } bool MemoryView::isParent() { return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize(); } std::string& MemoryView::getName() { return _name; } MemoryView& MemoryView::clone(std::string& name) { return _src->addUser(name, _src->getRange(_name)); } /* * ======================= * MemorySource * ======================= */ MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) { } MemorySource::~MemorySource() { // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource. // So this is a no-op. } NVMatrix& MemorySource::getMemory(std::string& name) { return getMemory(name, _memory.getLeadingDim()); } // Deletes old view when appropriate NVMatrix& MemorySource::getMemory(std::string& name, int numCases) { numCases = numCases < 0 ? 
_memory.getLeadingDim() : numCases;
    _lock.acquire();
    if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) {
        int d = NVMatrix::getDeviceID();
        NVMatrix::setDeviceID(_deviceID);
        _memory.resize(_size, numCases, false);
        for (map<std::string, NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        if (d >= 0) {
            NVMatrix::setDeviceID(d);
        }
    }
    if (_memoryViews.count(name) == 0) {
        assert(!_memory.isTrans());
        _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second);
    }
    NVMatrix& view = *_memoryViews[name];
    assert(view.isContiguous());
    _lock.release();
    return view;
}

MemoryView& MemorySource::addUser(std::string& name, std::pair<int,int> range) {
    assert(_viewRanges.count(name) == 0);
    _viewRanges[name] = range;
    return *new MemoryView(*this, name);
}

MemoryView& MemorySource::addUser(std::string& name) {
    return addUser(name, std::pair<int,int>(0, _size));
}

MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) {
    return (new MemorySource(size, deviceID))->addUser(parentUser);
}

pair<int,int> MemorySource::getRange(std::string& name) {
    return _viewRanges[name];
}

int MemorySource::getSize() {
    return _size;
}

bool MemorySource::truncate(std::string& name) {
    bool truncated = false;
    _lock.acquire();
    _truncateRequests.insert(name);
    if (_truncateRequests.size() == _viewRanges.size()) {
        for (map<std::string, NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        _memory.truncate();
        _truncateRequests.clear();
        truncated = true;
    }
    _lock.release();
    return truncated;
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu ================================================

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #include "../include/neuron.cuh" #include "../include/util.cuh" using namespace std; Neuron& Neuron::makeNeuron(PyObject* neuronDict) { std::string type = pyDictGetString(neuronDict, "type"); PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params"); if (type == "relu") { return *new ReluNeuron(); } if (type == "drelu") { return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); } if (type == "softrelu") { return *new SoftReluNeuron(); } if (type == "brelu") { return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a")); } if (type == "abs") { return *new AbsNeuron(); } if (type == "logistic") { return *new LogisticNeuron(); } if (type == "tanh") { return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); } if (type == "square") { return *new SquareNeuron(); } if (type == "sqrt") { return *new SqrtNeuron(); } if (type == "linear") { return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b")); } if (type == "log") { return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a")); } if (type == "ident") { return *new Neuron(); } throw std::string("Unknown neuron type: ") + type; } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/pyconvnet.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */

#include <Python.h>
#include <arrayobject.h>
#include <assert.h>
#include <signal.h>
#include <execinfo.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include "../../util/include/matrix.h"
#include "../../util/include/queue.h"
#include "../include/worker.cuh"
#include "../include/util.cuh"
#include "../include/cost.cuh"
#include "../include/pyconvnet.cuh"
#include "../include/convnet.cuh"
#include "../include/jpeg.h"

using namespace std;
static ConvNet* model = NULL;

static PyMethodDef _ConvNetMethods[] = {
    { "initModel",          initModel,          METH_VARARGS },
    { "startBatch",         startBatch,         METH_VARARGS },
    { "finishBatch",        finishBatch,        METH_VARARGS },
    { "checkGradients",     checkGradients,     METH_VARARGS },
    { "startMultiviewTest", startMultiviewTest, METH_VARARGS },
    { "startFeatureWriter", startFeatureWriter, METH_VARARGS },
    { "startDataGrad",      startDataGrad,      METH_VARARGS },
    { "syncWithHost",       syncWithHost,       METH_VARARGS },
    { "decodeJpeg",         decodeJpeg,         METH_VARARGS },
    { NULL, NULL }
};

void init_ConvNet() {
    (void) Py_InitModule("_ConvNet", _ConvNetMethods);
    import_array();
}

void signalHandler(int sig) {
    const size_t max_trace_size = 40;
    void *array[max_trace_size];
    size_t trace_size = backtrace(array, max_trace_size);
    fprintf(stderr, "Error signal %d:\n", sig);
    backtrace_symbols_fd(array, trace_size, STDERR_FILENO);
    exit(1);
}

PyObject* initModel(PyObject *self, PyObject *args) {
    assert(model == NULL);
    signal(SIGSEGV, signalHandler);
    signal(SIGABRT, signalHandler);
    PyDictObject* pyLayerParams;
    PyListObject* pyDeviceIDs;
    int pyMinibatchSize;
    int conserveMem;

    if (!PyArg_ParseTuple(args, "O!O!ii",
                          &PyDict_Type, &pyLayerParams,
                          &PyList_Type, &pyDeviceIDs,
                          &pyMinibatchSize,
                          &conserveMem)) {
        return NULL;
    }
    intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs);

    model = new ConvNet((PyObject*)pyLayerParams, deviceIDs, pyMinibatchSize, conserveMem);

    model->start();
    return Py_BuildValue("i", 0);
}

/*
 * Starts training/testing on the given batch (asynchronous -- returns immediately).
 */
PyObject* startBatch(PyObject *self, PyObject *args) {
    assert(model != NULL);
//    printf("starting next batch\n");
    PyListObject* data;
    double progress;
    int test = 0;
    if (!PyArg_ParseTuple(args, "O!d|i",
                          &PyList_Type, &data,
                          &progress,
                          &test)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);

    TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

/*
 * Starts testing on the given batch (asynchronous -- returns immediately).
 */
PyObject* startMultiviewTest(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    int numViews;
    PyArrayObject* pyProbs = NULL;
    char* logregName = NULL;
    if (!PyArg_ParseTuple(args, "O!i|O!s",
                          &PyList_Type, &data,
                          &numViews,
                          &PyArray_Type, &pyProbs,
                          &logregName)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);
    MultiviewTestWorker* wr = pyProbs == NULL ?
        new MultiviewTestWorker(*model, *cpuData, numViews)
      : new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

PyObject* startFeatureWriter(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    PyListObject* pyFtrs;
    PyListObject* pyLayerNames;
    if (!PyArg_ParseTuple(args, "O!O!O!",
                          &PyList_Type, &data,
                          &PyList_Type, &pyFtrs,
                          &PyList_Type, &pyLayerNames)) {
        return NULL;
    }
    stringv* layerNames = getStringV((PyObject*)pyLayerNames);
    CPUData* cpuData = new CPUData((PyObject*)data);
    MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs);

    FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

PyObject* startDataGrad(PyObject *self, PyObject *args) {
//    assert(model != NULL);
//    PyListObject* data;
//    int dataLayerIdx, softmaxLayerIdx;
//    if (!PyArg_ParseTuple(args, "O!ii",
//                          &PyList_Type, &data,
//                          &dataLayerIdx, &softmaxLayerIdx)) {
//        return NULL;
//    }
//    CPUData* cpuData = new CPUData((PyObject*)data);
//    Matrix& ftrs = *mvec.back();
//    mvec.pop_back();
//
//    DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx);
//    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

/*
 * Waits for the trainer to finish training on the batch given to startBatch.
 * This is a blocking call so lets release the GIL.
 */
PyObject* finishBatch(PyObject *self, PyObject *args) {
    assert(model != NULL);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::BATCH_DONE);

    Cost& cost = res->getResults();
    PyObject* dict = PyDict_New();
    CostMap& costMap = cost.getCostMap();
    for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) {
        PyObject* v = PyList_New(0);
        for (vector<double>::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) {
            PyObject* f = PyFloat_FromDouble(*iv);
            PyList_Append(v, f);
        }
        PyDict_SetItemString(dict, it->first.c_str(), v);
    }

    PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases());
    delete res; // Deletes cost too

    return retVal;
}

PyObject* checkGradients(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    if (!PyArg_ParseTuple(args, "O!",
                          &PyList_Type, &data)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);

    GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData);
    model->getWorkerQueue().enqueue(wr);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::BATCH_DONE);
    delete res;
    return Py_BuildValue("i", 0);
}

/*
 * Copies weight matrices from GPU to system memory.
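 * Blocking: enqueues a SyncWorker and then waits on the result queue for a
 * SYNC_DONE result before returning control to Python.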
*/ PyObject* syncWithHost(PyObject *self, PyObject *args) { assert(model != NULL); SyncWorker* wr = new SyncWorker(*model); model->getWorkerQueue().enqueue(wr); WorkResult* res = model->getResultQueue().dequeue(); assert(res != NULL); assert(res->getResultType() == WorkResult::SYNC_DONE); delete res; return Py_BuildValue("i", 0); } PyObject* decodeJpeg(PyObject *self, PyObject *args) { PyListObject* pyJpegStrings; PyArrayObject* pyTarget; int img_size, inner_size, test, multiview; if (!PyArg_ParseTuple(args, "O!O!iiii", &PyList_Type, &pyJpegStrings, &PyArray_Type, &pyTarget, &img_size, &inner_size, &test, &multiview)) { return NULL; } Thread* threads[NUM_JPEG_DECODER_THREADS]; int num_imgs = PyList_GET_SIZE(pyJpegStrings); int num_imgs_per_thread = DIVUP(num_imgs, NUM_JPEG_DECODER_THREADS); Matrix& dstMatrix = *new Matrix(pyTarget); for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { int start_img = t * num_imgs_per_thread; int end_img = min(num_imgs, (t+1) * num_imgs_per_thread); threads[t] = new DecoderThread((PyObject*)pyJpegStrings, dstMatrix, start_img, end_img, img_size, inner_size, test, multiview); threads[t]->start(); } for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) { threads[t]->join(); delete threads[t]; } assert(dstMatrix.isView()); delete &dstMatrix; return Py_BuildValue("i", 0); } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/reducepipeline.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */

#include <assert.h>
#include <algorithm>
#include "../include/reducepipeline.cuh"

using namespace std;

/* =========================
 * IReducerSegment
 * =========================
 */
// Null mat --> reducer on host
IReduceSegment::IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue)
    : _deviceID(deviceID), _next(NULL), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getTgtDeviceID())) {
}

IReduceSegment::~IReduceSegment() {
}

NVMatrix& IReduceSegment::getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx) {
    NVMatrix& line = mat.reshaped(1, mat.getNumElements());
    int start = chunkIdx * chunkSize;
    int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
//    _mat->printShape("_mat");
    NVMatrix& chunk = line.sliceCols(start, end);
    delete &line;
//    chunk.printShape("chunk");
    return chunk;
}

void* IReduceSegment::run() {
    bool exit = false;
    while (!exit) {
        ReduceMessage& msg = *_queue.dequeue();
        if (msg.getType() == EXIT) {
            exit = true;
        } else {
            bool term = processMessage(msg);
            if (term) {
                assert(_finishQueue);
                _finishQueue->enqueue(1);
            }
        }
        delete &msg;
    }
    return NULL;
}

inline NVMatrix& IReduceSegment::getMatrix(ReduceMessage& msg) {
    return msg.getMatrix(getDeviceID());
}

Queue<ReduceMessage*>& IReduceSegment::getQueue() {
    return _queue;
}

inline int IReduceSegment::getDeviceID() const {
    return _deviceID;
}

void IReduceSegment::addPrev(IReduceSegment& c) {
    _prev.push_back(&c);
}

void IReduceSegment::addNext(ReducePeer& c) {
    assert(_next == NULL);
    _next = &c;
    c.addPrev(*this);
}

bool IReduceSegment::isTerminal() const {
    return _next == NULL;
}

/* =========================
 * ReducerSource
 * =========================
 */
ReducerSource::ReducerSource(IEightGPUReducer& parent, int deviceID) : IReduceSegment(parent, deviceID, NULL) {
}

bool ReducerSource::processMessage(ReduceMessage& msg) {
    assert(msg.getType() == REDUCE_START);
    int numChunks = min(getMatrix(msg).getNumElements(),
                        max(REDUCE_MIN_CHUNKS, min(REDUCE_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), REDUCE_MIN_CHUNK_SIZE))));
    int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
    //printf("num chunks: %d\n", numChunks);
    for (int c = 0; c <= numChunks; ++c) {
        _next->getQueue().enqueue(new ReduceChunkMessage(*this, c, chunkSize, numChunks,
                                                         msg.getScaleIntermediates(), msg.getScaleTarget(), msg.getMatrices()));
    }
    return false;
}

/* =========================
 * ReducerPeer
 * =========================
 */
ReducePeer::ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue)
    : IReduceSegment(parent, deviceID, finishQueue), _numInputsFinished(0) {
    _add = deviceID != DEVICE_HOST;
}

ReducePeer::ReducePeer(IEightGPUReducer& parent)
    : IReduceSegment(parent, DEVICE_HOST, NULL), _numInputsFinished(0), _add(false) {
}

ReducePeer::~ReducePeer() {
    for(std::map<int, cudaStream_t>::iterator it = _streams.begin(); it != _streams.end(); ++it) {
        checkCudaErrors(cudaStreamDestroy(it->second));
    }
    _streams.clear();
}

inline cudaStream_t ReducePeer::getStream(int deviceID) {
    if (deviceID < 0) {
        return NULL;
    }
    if (_streams.count(deviceID) == 0) {
        NVMatrix::setDeviceID(deviceID);
        checkCudaErrors(cudaStreamCreateWithFlags(&_streams[deviceID], cudaStreamNonBlocking));
    }
    return _streams[deviceID];
}

bool ReducePeer::processMessage(ReduceMessage& msg) {
    assert(msg.getType() == REDUCE_CHUNK);

    ReduceChunkMessage& cmsg = *static_cast<ReduceChunkMessage*>(&msg);
//    if (_numInputsReceived.count(cmsg.getChunkIdx()) == 0) {
//        _numInputsReceived[cmsg.getChunkIdx()] = 0;
//    }
    int& inputsRcvd = ++_numInputsReceived[cmsg.getChunkIdx()];
//    printf("reducer on device %d got msg chunk idx %d of %d, inputs rcvd for this chunk idx: %d/%d\n",
//           getDeviceID(), cmsg.getChunkIdx(), cmsg.getNumChunks(),_numInputsReceived[cmsg.getChunkIdx()], _prev.size());
    if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
        IReduceSegment& src = cmsg.getSource();
        float scalePrev = isTerminal() ? cmsg.getScaleIntermediates() : 1;
        float scaleSelf = inputsRcvd == 1 ? _add * (isTerminal() ? cmsg.getScaleTarget() : 1): 1;
        if (scaleSelf == 0 || isTerminal()) {
            if (getDeviceID() >= 0) {
                NVMatrix::setDeviceID(getDeviceID());
            }
            getMatrix(msg).resize(src.getMatrix(msg));
        }
        assert(getMatrix(msg).isSameDims(src.getMatrix(msg)));
        NVMatrix& prevChunk = getChunk(src.getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        int execDeviceID = getDeviceID() >= 0 ? getDeviceID() : src.getDeviceID();
        if (execDeviceID >= 0) {
            NVMatrix::setDeviceID(execDeviceID);
            prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, getStream(execDeviceID));
            NVMatrix::syncStream(getStream(execDeviceID));
        } else {
            assert(!isTerminal());
            hostAdd(prevChunk.getDevData(), myChunk.getDevData(), prevChunk.getNumElements(), scaleSelf);
        }
        delete &prevChunk;
        delete &myChunk;
    } else {
        _numInputsFinished++;
    }
    if (!isTerminal() && inputsRcvd == _prev.size()) {
//        printf("  device %d enqueueing msg for next on device %d\n", getDeviceID(), _next->getDeviceID());
        _next->getQueue().enqueue(
            new ReduceChunkMessage(*this, cmsg.getChunkIdx(), cmsg.getChunkSize(), cmsg.getNumChunks(),
                                   cmsg.getScaleIntermediates(), cmsg.getScaleTarget(), cmsg.getMatrices()));
    }

    bool finished = _numInputsFinished == _prev.size();
    if (finished) {
        _numInputsFinished = 0;
        _numInputsReceived.clear();
    }
    return finished && isTerminal();
}

void ReducePeer::hostAdd(const float* src, float* tgt, const int n, const float scaleTgt) {
    if (scaleTgt != 0) {
        for (int i = 0; i < n; ++i) {
            tgt[i] = scaleTgt * tgt[i] + src[i];
        }
    } else {
        for (int i = 0; i < n; ++i) {
            tgt[i] = src[i];
        }
    }
}

inline NVMatrix& ReducePeer::getMatrix(ReduceMessage& msg) {
    if (getDeviceID() != DEVICE_HOST) {
        return IReduceSegment::getMatrix(msg);
    }
    return _mat;
}

/* =========================
 * EightGPUReducer
 * =========================
 */
IEightGPUReducer::IEightGPUReducer(int tgtDeviceID) : _tgtDeviceID(tgtDeviceID) {
}

IEightGPUReducer::~IEightGPUReducer() {
    vector<IReduceSegment*> v;
    v.insert(v.end(), _sources.begin(), _sources.end());
    v.insert(v.end(), _peers.begin(), _peers.end());
    for (vector<IReduceSegment*>::iterator it = v.begin(); it != v.end(); ++it) {
        (*it)->getQueue().enqueue(new ReduceMessage(EXIT));
        (*it)->join();
        delete *it;
    }
}

IEightGPUReducer& IEightGPUReducer::construct() {
    vector<int> same, other;
    for (int i = 0; i < 8; ++i) {
        if (i != _tgtDeviceID) {
            if (NVMatrix::canAccessPeer(_tgtDeviceID, i)) {
                same.insert(same.begin() + rand() % (1 + same.size()), i);
            } else {
                other.insert(other.begin() + rand() % (1 + other.size()), i);
            }
        }
    }
    assert(same.size() == 3);
    assert(other.size() == 4);
    makeConnections(same, other);
    for (vector<ReducerSource*>::const_iterator it = _sources.begin(); it != _sources.end(); ++it) {
        (*it)->start();
    }
    for (vector<ReducePeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
        (*it)->start();
    }
    return *this;
}

void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget) {
    assert(mats.size() == 8);
    // Check if source matrices are 0-sized
    bool zero = true;
    for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
        if (it->first != _tgtDeviceID && it->second->getNumElements() != 0) {
            zero = false;
            break;
        }
    }
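    // If every source matrix is empty there is nothing to reduce: just mirror the
    // (empty) shape onto the target. Otherwise kick off the pipeline by messaging
    // each source segment, then block until the terminal peer signals the finish queue.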
    if (zero) {
        mats[_tgtDeviceID]->resize(*mats[(_tgtDeviceID + 1) % 8]);
    } else {
        for (vector<ReducerSource*>::const_iterator it = _sources.begin(); it != _sources.end(); ++it) {
            (*it)->getQueue().enqueue(new ReduceStartMessage(scaleIntermediates, scaleTarget, mats));
        }
        _finishQueue.dequeue();
    }
    assert(_finishQueue.getNumElements() == 0);
}

void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates) {
    reduce(mats, scaleIntermediates, 1);
}

void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats) {
    reduce(mats, 1, 1);
}

int IEightGPUReducer::getTgtDeviceID() const {
    return _tgtDeviceID;
}

/* =========================
 * EightGPUReducer1
 * =========================
 */
EightGPUReducer1::EightGPUReducer1(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) {
}

void EightGPUReducer1::makeConnections(vector<int>& same, vector<int>& other) {
    // Setup segments on same truck
    _peers.push_back(new ReducePeer(*this, _tgtDeviceID, &_finishQueue));   // peers[0] = tgt
    _peers.push_back(new ReducePeer(*this, same[0], &_finishQueue));        // peers[1] = same truck 1
    _peers.push_back(new ReducePeer(*this, same[1], &_finishQueue));        // peers[2] = same truck 2
    _sources.push_back(new ReducerSource(*this, same[2]));                  // sources[0] = same truck 3

    _sources[0]->addNext(*_peers[2]);
    _peers[2]->addNext(*_peers[1]);
    _peers[1]->addNext(*_peers[0]);

    // Setup segments on other truck
    _sources.push_back(new ReducerSource(*this, other[0]));                 // sources[1] = other truck 1
    _peers.push_back(new ReducePeer(*this, other[1], &_finishQueue));       // peers[3] = other truck 2
    _peers.push_back(new ReducePeer(*this, other[2], &_finishQueue));       // peers[4] = other truck 3
    _sources.push_back(new ReducerSource(*this, other[3]));                 // sources[2] = other truck 4
    _peers.push_back(new ReducePeer(*this));                                // peers[5] = host 1
    _peers.push_back(new ReducePeer(*this));                                // peers[6] = host 2
    _peers.push_back(new ReducePeer(*this));                                // peers[7] = host 3

    _sources[1]->addNext(*_peers[3]);
    _peers[3]->addNext(*_peers[5]);
    _peers[5]->addNext(*_peers[7]);
    _peers[7]->addNext(*_peers[0]);
    _peers[4]->addNext(*_peers[6]);
    _peers[6]->addNext(*_peers[7]);
    _sources[2]->addNext(*_peers[4]);
}

/* =========================
 * EightGPUReducer2
 * =========================
 */
EightGPUReducer2::EightGPUReducer2(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) {
}

void EightGPUReducer2::makeConnections(vector<int>& same, vector<int>& other) {
    // Setup segments on same truck
    _peers.push_back(new ReducePeer(*this, _tgtDeviceID, &_finishQueue));   // peers[0] = tgt
    _peers.push_back(new ReducePeer(*this, same[0], &_finishQueue));        // peers[1] = same truck 1
    _peers.push_back(new ReducePeer(*this, same[1], &_finishQueue));        // peers[2] = same truck 2
    _sources.push_back(new ReducerSource(*this, same[2]));                  // sources[0] = same truck 3

    _sources[0]->addNext(*_peers[2]);
    _peers[2]->addNext(*_peers[1]);
    _peers[1]->addNext(*_peers[0]);

    // Setup segments on other truck
    _sources.push_back(new ReducerSource(*this, other[0]));                 // sources[1] = other truck 1
    _peers.push_back(new ReducePeer(*this, other[1], &_finishQueue));       // peers[3] = other truck 2
    _peers.push_back(new ReducePeer(*this, other[2], &_finishQueue));       // peers[4] = other truck 3
    _peers.push_back(new ReducePeer(*this, other[3], &_finishQueue));       // peers[5] = other truck 4
    _peers.push_back(new ReducePeer(*this));                                // peers[6] = host 1

    _sources[1]->addNext(*_peers[3]);
    _peers[3]->addNext(*_peers[4]);
    _peers[4]->addNext(*_peers[5]);
    _peers[5]->addNext(*_peers[6]);
    _peers[6]->addNext(*_peers[0]);
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/streambroadcast.cu ================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../include/streambroadcast.cuh"

using namespace std;

/*
 * =====================
 * StreamBroadcast
 * =====================
 */

StreamBroadcast::StreamBroadcast(map<int, cudaStream_t>& streams) {
    _streams = streams;
}

StreamBroadcast::StreamBroadcast() {
}

void StreamBroadcast::toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice) {
    src.copy(hostmem, _streams[srcDevice]);
}

void StreamBroadcast::toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput) {
    tgt.add(hostmem, scaleTarget, scaleOutput, tgt, _streams[tgtDevice]);
}

void StreamBroadcast::init(map<int, NVMatrix*>& mats) {
    for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
        if (_streams.count(it->first) == 0) {
            _ownedStreams.insert(it->first);
            NVMatrix::setDeviceID(it->first);
            checkCudaErrors(cudaStreamCreateWithFlags(&_streams[it->first], cudaStreamNonBlocking));
        }
    }
}

StreamBroadcast::~StreamBroadcast() {
    for (set<int>::const_iterator it = _ownedStreams.begin(); it != _ownedStreams.end(); ++it) {
        checkCudaErrors(cudaStreamDestroy(_streams[*it]));
    }
}

cudaStream_t StreamBroadcast::getStream(int deviceID) {
    return _streams[deviceID];
}

// Sync stream associated with given device id
void StreamBroadcast::sync(int deviceID) {
    NVMatrix::syncStream(_streams[deviceID]);
}

void StreamBroadcast::transfer(map<int, NVMatrix*>& mats, int srcDevice) {
    transfer(mats, _hostMem, srcDevice, 0, 1);
}

void StreamBroadcast::transfer(map<int, NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput) {
    transfer(mats, _hostMem, srcDevice, scaleTarget, scaleOutput);
}

void StreamBroadcast::transfer(map<int, NVMatrix*>& mats, HostNVMatrix& hostbuf, int srcDevice, float scaleTarget, float scaleOutput) {
    int oldDeviceID = NVMatrix::getDeviceID();
    assert(mats.count(srcDevice) != 0);
    init(mats);
//    assert(_streams.count(srcDevice) != 0);
    if (mats.size() > 1) {
        if (mats[srcDevice]->getNumElements() == 0) {
            for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                it->second->resize(*mats[srcDevice]);
            }
        } else {
            int tgtDevice = mats.begin()->first != srcDevice ? mats.begin()->first : (++mats.begin())->first;
            // This case is a simple copy
            if (mats.size() == 2 && NVMatrix::canAccessPeer(tgtDevice, srcDevice)) {
                NVMatrix::setDeviceID(tgtDevice);
                mats[tgtDevice]->add(*mats[srcDevice], scaleTarget, scaleOutput, *mats[tgtDevice], _streams[tgtDevice]);
            } else {
                NVMatrix& src = *mats[srcDevice];
                if (hostbuf.getNumElements() < src.getNumElements()) {
                    hostbuf.resize(1, src.getNumElements());
                }
                hostbuf.setTrans(src.isTrans());
                NVMatrix& hostmat = hostbuf.sliceCols(0, src.getNumElements());
                assert(hostmat.isView());
                hostmat.reshape(src.getNumRows(), src.getNumCols());

                for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                    assert(it->second->isContiguous());
                    NVMatrix::setDeviceID(it->first);
                    it->second->resize(src);
                    assert(it->second->isTrans() == src.isTrans());
                }
                int numChunks = min(DIVUP(src.getNumElements(), SB_MIN_CHUNK_SIZE), SB_MAX_CHUNKS);
                if (numChunks == 1) { // This is a bit faster for small matrices
                    NVMatrix::setDeviceID(srcDevice);
                    toHostMem(src, hostmat, srcDevice);
                    NVMatrix::syncStream(_streams[srcDevice]);

                    for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                        if (it->first != src.getDataDeviceID()) {
                            NVMatrix::setDeviceID(it->first);
                            toTarget(hostmat, *it->second, it->first, scaleTarget, scaleOutput);
                        }
                    }
                } else {
                    int n = src.getNumElements();
                    map<int, NVMatrix*> lines;
                    for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                        lines[it->first] = &it->second->reshaped(1, n);
                        lines[it->first]->setTrans(src.isTrans());
                    }
                    NVMatrix& srcLine = *lines[srcDevice];
                    hostmat.reshape(1, n);

                    int chunkSize = DIVUP(n, numChunks);
                    bool trans = src.isTrans();
                    for (int i = 0; i < numChunks; ++i) {
                        int start = i * chunkSize;
                        int end = min((i+1) * chunkSize, n);
                        if (start < end) {
                            NVMatrix& tmpSrc = srcLine.sliceCols(start, end); // view
                            NVMatrix& tmpHostmem = hostmat.sliceCols(start, end); // view

                            NVMatrix::setDeviceID(srcDevice);
                            toHostMem(tmpSrc, tmpHostmem, srcDevice);
                            NVMatrix::syncStream(_streams[srcDevice]);

                            for (map<int, NVMatrix*>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
                                if (it->first != srcDevice) {
                                    NVMatrix& tmpTgt = it->second->sliceCols(start, end); // view
                                    NVMatrix::setDeviceID(it->first);
                                    toTarget(tmpHostmem, tmpTgt, it->first, scaleTarget, scaleOutput);
                                    delete &tmpTgt;
                                }
                            }
                            delete &tmpSrc;
                            delete &tmpHostmem;
                        }
                    }
                    for (map<int, NVMatrix*>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
                        delete it->second;
                    }
                }
                delete &hostmat;
            }
            for(map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                if (it->first != srcDevice) {
                    NVMatrix::syncStream(_streams[it->first]);
                }
            }
        }
    }
    if (oldDeviceID >= 0) {
        NVMatrix::setDeviceID(oldDeviceID);
    }
}

================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/util.cu ================================================

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #include #include #include #include "../include/util.cuh" using namespace std; stringv* getStringV(PyObject* pyList) { if (pyList == NULL) { return NULL; } stringv* vec = new stringv(); for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { vec->push_back(std::string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i)))); } return vec; } floatv* getFloatV(PyObject* pyList) { if (pyList == NULL) { return NULL; } floatv* vec = new floatv(); for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i))); } return vec; } intv* getIntV(PyObject* pyList) { if (pyList == NULL) { return NULL; } intv* vec = new intv(); for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i))); } return vec; } int* getIntA(PyObject* pyList) { if (pyList == NULL) { return NULL; } int* arr = new int[PyList_GET_SIZE(pyList)]; for (int i = 0; i < PyList_GET_SIZE(pyList); i++) { arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)); } return arr; } MatrixV* getMatrixV(PyObject* pyList) { return getMatrixV(pyList, PyList_GET_SIZE(pyList)); } MatrixV* getMatrixV(PyObject* pyList, int len) { if (pyList == NULL) { return NULL; } MatrixV* vec = new MatrixV(); for (int i = 0; i < len; i++) { vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i))); } return vec; } PyObjectV* pyDictGetValues(PyObject* dict) { PyObjectV* pov = new PyObjectV(); PyObject* valuesList = PyDict_Values(dict); int numValues = PyList_GET_SIZE(valuesList); for (int i = 0; i < numValues; i++) { pov->push_back(PyList_GET_ITEM(valuesList, i)); } Py_DECREF(valuesList); return pov; } int pyDictGetInt(PyObject* dict, const char* key) { return PyInt_AS_LONG(PyDict_GetItemString(dict, key)); } intv* pyDictGetIntV(PyObject* dict, const char* key) { return getIntV(PyDict_GetItemString(dict, key)); } int* pyDictGetIntA(PyObject* dict, const char* key) { return getIntA(PyDict_GetItemString(dict, key)); } std::string pyDictGetString(PyObject* dict, const char* key) { return std::string(PyString_AS_STRING(PyDict_GetItemString(dict, key))); } float pyDictGetFloat(PyObject* dict, const char* key) { return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key)); } floatv* pyDictGetFloatV(PyObject* dict, const char* key) { return getFloatV(PyDict_GetItemString(dict, key)); } Matrix* pyDictGetMatrix(PyObject* dict, const char* key) { return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key)); } MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) { return getMatrixV(PyDict_GetItemString(dict, key)); } stringv* pyDictGetStringV(PyObject* dict, const char* key) { return getStringV(PyDict_GetItemString(dict, key)); } bool pyDictHasKey(PyObject* dict, const char* key) { PyObject* str = PyString_FromString(key); bool b = PyDict_Contains(dict, str); Py_DECREF(str); return b; } template void shuffleVector(vector& v, int start, int end) { const int len = end - start; for (int i = 0; i < len*5; ++i) { int r1 = start + rand() % len; int r2 = start + rand() % len; int tmp = v[r1]; v[r1] = v[r2]; v[r2] = tmp; } } template std::string tostr(T n) { ostringstream result; result << n; return result.str(); } template void deleteElements(vector& v) { deleteElements(v, false); } template void deleteElements(vector& v, bool deleteContainer) { for (typename vector::const_iterator it = v.begin(); it != v.end(); ++it) { delete *it; } if (deleteContainer) { delete &v; } } static Lock deviceCPULock; static std::map > deviceCPUs; std::vector& getDeviceCPUs(int deviceID) { 
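    // Looks up which host CPUs are physically local to the given GPU: the
    // device's PCI location is formatted as "dddd:bb:dd.0" (e.g.
    // "0000:02:00.0") and the kernel's
    // /sys/bus/pci/devices/<pci-id>/local_cpulist file is read. That file
    // holds comma-separated CPU ranges such as "0-7,16-23", which are
    // expanded below into individual CPU indices and cached per device
    // under deviceCPULock.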
deviceCPULock.acquire(); if (deviceCPUs.count(deviceID) == 0 && deviceID >= 0) { struct cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, deviceID)); char pciString[13]; sprintf(pciString, "%04x", props.pciDomainID); pciString[4] = ':'; sprintf(pciString + 5, "%02x", props.pciBusID); pciString[7] = ':'; sprintf(pciString + 8, "%02x", props.pciDeviceID); pciString[10] = '.'; pciString[11] = '0'; pciString[12] = 0; std::string path = std::string("/sys/bus/pci/devices/") + std::string(pciString) + "/local_cpulist"; ifstream f(path.c_str()); if (f.is_open()) { std::string cpuString; while (getline(f, cpuString, ',')) { int start, end; int found = sscanf(cpuString.c_str(), "%d-%d", &start, &end); end = found == 1 ? start : end; if (found > 0) { for (int i = start; i <= end; ++i) { deviceCPUs[deviceID].push_back(i); } } } f.close(); } else { printf("Unable to open %s\n", path.c_str()); } } vector& ret = deviceCPUs[deviceID]; deviceCPULock.release(); return ret; } template void shuffleVector(std::vector& v, int start, int end); template std::string tostr(int n); template void deleteElements(std::vector& v, bool deleteContainer); ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/weights.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "../include/weights.cuh" #include "../include/lr.cuh" #include "../include/worker.cuh" using namespace std; /* ======================== * IWeightReducer * ======================== */ int IWeightReducer::getDeviceID() { return _replicas[_tgtReplicaID]->getDeviceID(); } IWeightReducer::IWeightReducer(std::map& replicas, int tgtReplicaID) : _replicas(replicas), _tgtReplicaID(tgtReplicaID) { } IWeightReducer::~IWeightReducer() { } IWeightReducer& IWeightReducer::make(std::map& replicas, int tgtReplicaID) { if (replicas.size() == 8) { return *new ParallelWeightReducer(replicas, tgtReplicaID); } return *new SequentialWeightReducer(replicas, tgtReplicaID); } /* ======================== * SequentialWeightReducer * ======================== */ SequentialWeightReducer::SequentialWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { _sb = new StreamBroadcast(); } SequentialWeightReducer::~SequentialWeightReducer() { delete _sb; } void SequentialWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { std::map mats; // device id -> grad mats[getDeviceID()] = toInc ? 
&_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); for (int i = 0, r = _tgtReplicaID; i < _replicas.size(); ++i, r = (r + 1) % _replicas.size()) { if (r != _tgtReplicaID) { mats[_replicas[r]->getDeviceID()] = gradShards[r]; _sb->transfer(mats, _replicas[r]->getDeviceID(), 1, gradScale); mats.erase(_replicas[r]->getDeviceID()); } } } /* ======================== * ParallelWeightReducer * ======================== */ ParallelWeightReducer::ParallelWeightReducer(std::map& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) { _reducer = &(new EightGPUReducer1(getDeviceID()))->construct(); } ParallelWeightReducer::~ParallelWeightReducer() { delete _reducer; } void ParallelWeightReducer::reduce(std::map gradShards, float gradScale, bool toInc) { std::map mats; // device id -> grad mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad(); for (std::map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { if (it->first != _tgtReplicaID) { mats[it->second->getDeviceID()] = gradShards[it->first]; } } _reducer->reduce(mats, gradScale, 1); } // weights has pointer to layer, layer pointer to thread // thread has sync (copy) object for every other thread // weights uses copy object to sum grad contributions into inc matrix slice (phase 1) // weights broadcasts inc matrix slice to other inc matrix replicas (phase 2) NVMatrix& Weights::operator*() const { return getW(); } /* * TODO: get rid of this constructor duplication. */ Weights::Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent) { init(srcWeights.getCPUW(), srcWeights.getCPUWInc(), lrs, parent, 0, 0, srcWeights.getMom(), srcWeights.isUseGrad(), false); _srcWeights = &srcWeights; } Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad) { init(hWeights, hWeightsInc, lrs, parent, wc, wball, mom, useGrad, true); } void Weights::init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup) { _srcWeights = NULL; _hWeights = &hWeights; _hWeightsInc = &hWeightsInc; _numUpdates = 0; _lrs = &lrs; _parent = &parent; _wc = wc; _wball = wball; _mom = mom; _useGrad = useGrad; _onGPU = false; _weights = NULL; _weightsInc = NULL; _weightsGrad = NULL; _cleanup = cleanup; _reducer = NULL; _broadcaster = NULL; } Weights::~Weights() { delete _lrs; delete _reducer; delete _broadcaster; if (_cleanup) { delete _hWeights; delete _hWeightsInc; if (_srcWeights == NULL) { delete _weights; delete _weightsInc; delete _weightsGrad; } } } NVMatrix& Weights::getW() const { assert(_onGPU); return *_weights; } NVMatrix& Weights::getInc() const { assert(_onGPU); return *_weightsInc; } /* * TODO: This seems like pretty nasty behavior, I should change this. */ NVMatrix& Weights::getGrad() const { assert(_onGPU); return _useGrad ? 
*_weightsGrad : *_weightsInc;
}

Matrix& Weights::getCPUW() const {
    return *_hWeights;
}

Matrix& Weights::getCPUWInc() const {
    return *_hWeightsInc;
}

int Weights::getNumRows() const {
    return _hWeights->getNumRows();
}

int Weights::getNumCols() const {
    return _hWeights->getNumCols();
}

map<int, Weights*>& Weights::getReplicas() {
    return _replicas;
}

template <class T> T& Weights::getShard(T& mat, int replicaID) {
    const int n = mat.getNumElements();
    T& line = mat.reshaped(1, n);
    const int shardStart = min(n, replicaID * _shardSize);
    const int shardEnd = min(n, (replicaID + 1) * _shardSize);
    T& slice = line.sliceCols(shardStart, shardEnd);
    assert(slice.isView());
    delete &line;
    return slice;
}

template <class T> T& Weights::getShard(T& mat) {
    return getShard(mat, getReplicaID());
}

ISafeBroadcastNetwork& Weights::getBroadcaster() {
    if (_broadcaster == NULL) {
        set<int> devices;
        for (map<int, Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
            devices.insert(it->second->getDeviceID());
        }
        // NOTE: we must use safe broadcaster because we want to *add* our value to everyone else
        _broadcaster = &ISafeBroadcastNetwork::make(devices, getDeviceID()); //&(new NaiveBroadcaster(devices, getDeviceID()))->construct();
    }
    return *_broadcaster;
}

IWeightReducer& Weights::getReducer() {
    if (_reducer == NULL) {
        _reducer = &IWeightReducer::make(_replicas, getReplicaID());
    }
    return *_reducer;
}

void Weights::copyToCPU() {
    if (_srcWeights == NULL) {
        assert(_onGPU);
        NVMatrix::syncStream(); // for safety
        if (getReplicaID() == 0) {
            _weights->copyToHost(*_hWeights);
            // Synchronize weights amongst replicas while we're at it.
            map<int, NVMatrix*> weights;
            for (map<int, Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
                weights[it->second->getDeviceID()] = &it->second->getW();
            }
            // These things sync before returning.
            getBroadcaster().broadcast(weights, 1, 0);
        }
        if (_useGrad) {
            Matrix& hIncShard = getShard(*_hWeightsInc);
            _weightsInc->copyToHost(hIncShard);
            delete &hIncShard;
        } else {
            // In this case there's definitely only one replica
            _weightsInc->copyToHost(*_hWeightsInc);
        }
    }
}

// This function is assumed to be called in the order in which the layers
// were defined
void Weights::copyToGPU() {
    assert(!_onGPU);
    // Copies are performed on the default (computation) stream, so that's fine.
    if (_srcWeights == NULL) {
        _weights = _weights == NULL ? new NVMatrix() : _weights;
        _weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc;
        _weights->copyFromHost(*_hWeights, true);
        if (_useGrad) {
            // In this case there is no need to store the entire inc matrix.
            // Just this replica's shard (for synchronization purposes) will do.
            Matrix& hIncShard = getShard(*_hWeightsInc);
            _weightsInc->copyFromHost(hIncShard, true);
            delete &hIncShard;
        } else {
            _weightsInc->copyFromHost(*_hWeightsInc, true);
        }
        _weightsGrad = _useGrad ? (_weightsGrad == NULL ?
new NVMatrix(*_weights) : _weightsGrad) : NULL; } else { _weights = _srcWeights->_weights; _weightsInc = _srcWeights->_weightsInc; _weightsGrad = _srcWeights->_weightsGrad; } _onGPU = true; } void Weights::aggregateReplicaGradients(float progress) { map gradShards; map wShards; for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { gradShards[it->first] = &getShard(it->second->getGrad(), getReplicaID()); wShards[it->first] = &getShard(it->second->getW(), getReplicaID()); assert(wShards[it->first]->isContiguous() && gradShards[it->first]->isContiguous()); } float gradScale = _lrs->getValue(progress); NVMatrix::setDeviceID(getDeviceID()); if (_wc > 0) { NVMatrixTernaryOps::WeightedAdd wadd = NVMatrixTernaryOps::WeightedAdd(_mom, gradScale, -_wc * _lrs->getValue(progress)); _weightsInc->applyTernary(wadd, *gradShards[getReplicaID()], *wShards[getReplicaID()], *_weightsInc); } else { _weightsInc->add(*gradShards[getReplicaID()], _mom, gradScale); } // Reduce everyone's gradient into my inc shard NVMatrix::syncStream(); // Crucial since the reducer does everything in its own streams!! getReducer().reduce(gradShards, gradScale, true); // Broadcast my inc -> all replicas map mats; // device id -> grad mats[getDeviceID()] = _weightsInc; for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { if (it->first != getReplicaID()) { mats[it->second->getDeviceID()] = wShards[it->first]; } } getBroadcaster().broadcast(mats, 1, 1); NVMatrix::setDeviceID(getDeviceID()); wShards[getReplicaID()]->add(*_weightsInc); // Cleanup for (map::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) { delete gradShards[it->first]; delete wShards[it->first]; } } // When _useGrad is false, weightsInc is assumed to contain the // entire, properly scaled weight increment. // OTHERWISE, scale your gradient by 1 / numCases only. // The scaling by epsW will be done in this routine. void Weights::update(float progress) { // Only true owner of weights updates // printf("%s update weights\n", _parent->getName().c_str()); if (_srcWeights == NULL && _lrs->getBaseValue() > 0) { assert(_onGPU); if (_useGrad) { aggregateReplicaGradients(progress); } else { // Definitely no replicas in this case if (_wc > 0) { _weightsInc->add(*_weights, -_wc * _lrs->getValue(progress)); } _weights->add(*_weightsInc); } _numUpdates = 0; } } int Weights::incNumUpdates() { if (_srcWeights != NULL) { return _srcWeights->incNumUpdates(); } return _numUpdates++; } // Returns the number of times a gradient has been computed for this // weight matrix during the current pass (interval between two calls of update()) // through the net. This number will only be greater than 1 if this weight matrix // is *shared* by multiple layers in the net. 
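/*
 * A scalar sketch (illustrative only) of the per-shard update performed by
 * aggregateReplicaGradients above, where eps = _lrs->getValue(progress),
 * mom = _mom and wc = _wc; the real work is done by NVMatrix on the GPU:
 *
 *   inc = mom * inc + eps * grad - eps * wc * w;  // momentum + weight decay
 *   w  += inc;                                    // applied to this shard
 */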
int Weights::getNumUpdates() const { if (_srcWeights != NULL) { return _srcWeights->getNumUpdates(); } return _numUpdates; } float Weights::getEps(float progress) const { return _lrs->getValue(progress); } float Weights::getMom() const { return _mom; } float Weights::getWC() const { return _wc; } float Weights::getWBall() const { return _wball; } bool Weights::isUseGrad() const { // is good grammar return _useGrad; } bool Weights::isOwner() const { return _srcWeights == NULL; } ParameterSchedule& Weights::getLearningRateSchedule() const { return *_lrs; } void Weights::addReplica(Weights& replica) { _replicas[replica.getReplicaID()] = &replica; const int n = _hWeights->getNumElements(); _shardSize = DIVUP(n, _replicas.size()); } int Weights::getReplicaID() { return _parent->getReplicaID(); } int Weights::getDeviceID() { return _parent->getDeviceID(); } Layer& Weights::getParent() { return *_parent; } /* * =============== * WeightList * =============== */ Weights& WeightList::operator[](const int i) const { return *_weightList[i]; } Weights& WeightList::at(const int i) const { return *_weightList[i]; } WeightList::~WeightList() { for (int i = 0; i < _weightList.size(); i++) { delete _weightList[i]; } } WeightList::WeightList() { } void WeightList::addWeights(Weights& w) { _weightList.push_back(&w); } void WeightList::update(float progress) { for (int i = 0; i < getSize(); i++) { _weightList[i]->update(progress); } } void WeightList::copyToCPU() { for (int i = 0; i < getSize(); i++) { _weightList[i]->copyToCPU(); } } void WeightList::copyToGPU() { for (int i = 0; i < getSize(); i++) { _weightList[i]->copyToGPU(); } } int WeightList::getSize() const { return _weightList.size(); } void WeightList::addReplica(WeightList& replica) { for (int i = 0; i < getSize(); i++) { _weightList[i]->addReplica(replica[i]); } } ================================================ FILE: caffe2/contrib/cuda-convnet2/cudaconvnet/src/worker.cu ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #include "../include/util.cuh" #include "../include/worker.cuh" #include "../include/timer.cuh" using namespace std; /* * ==================== * WorkResult * ==================== */ WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) { } WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) { } WorkResult::~WorkResult() { delete _results; // delete NULL is ok } Cost& WorkResult::getResults() const { return *_results; } WorkResult::RESULTS WorkResult::getResultType() const { return _resultType; } /* * ==================== * Worker * ==================== */ Worker::Worker(ConvNet& convNet) : _convNet(&convNet) { } Worker::~Worker() { } /* * ==================== * DataWorker * ==================== */ DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data), _dp(NULL) { assert(_data != NULL); } bool DataWorker::run() { _dp = &_convNet->getDataProvider(); _dp->setData(*_data); _run(); _dp->clearData(); return false; } DataWorker::~DataWorker() { } /* * ==================== * TrainingWorker * ==================== */ TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test) : DataWorker(convNet, data), _progress(progress), _test(test) { } void TrainingWorker::_run() { _convNet->setTrainingProgress(_progress); Cost& batchCost = *new Cost(); int numMinibatches = _dp->getNumMinibatches(); for (int i = 0; i < numMinibatches; i++) { for (int p = 0; p < _convNet->getNumPasses(); p++) { _convNet->fprop(i, p, _test ? PASS_TEST : PASS_TRAIN); _convNet->getCost(batchCost); if (!_test) { _convNet->bprop(p, PASS_TRAIN); _convNet->updateWeights(p); } } } _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); } /* * ==================== * SyncWorker * ==================== */ SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) { } bool SyncWorker::run() { _convNet->copyToCPU(); _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE)); return false; } /* * ==================== * ExitWorker * ==================== */ ExitWorker::ExitWorker(ConvNet& convNet) : Worker(convNet) { } bool ExitWorker::run() { return true; } /* * ==================== * GradCheckWorker * ==================== */ GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data) : DataWorker(convNet, data) { } void GradCheckWorker::_run() { _convNet->checkGradients(); exit(0); // eh } /* * ==================== * MultiviewTestWorker * ==================== */ MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName) : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) { // assert(_data->getNumCases() % _numViews == 0); // assert(convNet.getNumReplicas() == 1); // For now? 
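        // Multiview testing fprops _numViews transformed copies of each case
        // (PASS_MULTIVIEW_TEST_START for the first view, PASS_MULTIVIEW_TEST
        // for the middle ones, PASS_MULTIVIEW_TEST_END for the last) and only
        // reads the cost after the final view, which is why the relaxed
        // asserts above expect the case count to be divisible by _numViews.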
} MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews) : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") { // assert(_data->getNumCases() % _numViews == 0); } MultiviewTestWorker::~MultiviewTestWorker() { // delete _cpuProbs; } CPUData& MultiviewTestWorker::getMinibatch(int v, int i) { int numCasesPerView = _dp->getNumCases() / _numViews; int miniStart = v * numCasesPerView + i * _dp->getMinibatchSize(); int miniEnd = v * numCasesPerView + min(numCasesPerView, (i + 1) * _dp->getMinibatchSize()); CPUData& mini = _dp->getDataSlice(miniStart, miniEnd); return mini; } void MultiviewTestWorker::_run() { int numCasesPerView = _dp->getNumCases() / _numViews; int numMiniPerView = DIVUP(numCasesPerView, _dp->getMinibatchSize()); Cost& batchCost = *new Cost(); for (int i = 0; i < numMiniPerView; i++) { for (int v = 0; v < _numViews - 1; v++) { for (int p = 0; p < _convNet->getNumPasses(); p++) { _convNet->fprop(getMinibatch(v, i), p, v == 0 ? PASS_MULTIVIEW_TEST_START : PASS_MULTIVIEW_TEST); } } for (int p = 0; p < _convNet->getNumPasses(); p++) { _convNet->fprop(getMinibatch(_numViews - 1, i), p, PASS_MULTIVIEW_TEST_END); _convNet->getCost(batchCost); } // if (_cpuProbs != NULL) { // LogregCostLayer& logregLayer = *dynamic_cast(&_convNet->getLayer(_logregName, 0)); // NVMatrix::setDeviceID(logregLayer.getDeviceID()); // Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(), // min(numCasesReal, (i + 1) * _dp->getMinibatchSize())); // NVMatrix& acts = logregLayer.getProbsAccum(); // NVMatrix acts_T; // acts.transpose(acts_T); // acts_T.copyToHost(miniProbs); // // delete &miniProbs; // } } _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); } /* * ==================== * FeatureWorker * ==================== */ FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures) : DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames), _deleteFeatures(deleteFeatures) { assert(layerNames.size() == ftrs.size()); for (int i = 0; i < layerNames.size(); i++) { assert(ftrs[i]->getNumRows() == data.getNumCases()); assert(!ftrs[i]->isTrans()); } } FeatureWorker::~FeatureWorker() { if (_deleteFeatures) { for (int i = 0; i < _ftrs->size(); i++) { delete _ftrs->at(i); } delete _ftrs; } delete _layerNames; } void FeatureWorker::_run() { Cost& batchCost = *new Cost(); map repStart; // Feature write start offsets within minibatch for (int i = 0; i < _dp->getNumMinibatches(); i++) { for (int f = 0; f < _layerNames->size(); f++) { repStart[f] = 0; } for (int p = 0; p < _convNet->getNumPasses(); p++) { _convNet->fprop(i, p, PASS_FEATURE_GEN); _convNet->getCost(batchCost); for (int f = 0; f < _layerNames->size(); f++) { if (_convNet->getLayer(_layerNames->at(f), 0).getFwdActiveInputReplicaIdx(p) >= 0) { Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(), min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); for (int r = 0; r < _convNet->getLayer(_layerNames->at(f), 0).getNumReplicas(); ++r) { Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f), r); int d = ftrLayer.getDeviceID(); NVMatrix::setDeviceID(d); NVMatrix& acts = ftrLayer.getActs(); Matrix& repMiniFtrs = miniFtrs.sliceRows(repStart[f], min(int(miniFtrs.getNumRows()), repStart[f] + acts.getLeadingDim())); NVMatrix acts_T; acts.transpose(false); acts.transpose(acts_T); acts_T.copyToHost(repMiniFtrs); NVMatrix::syncStream(); // eh why not delete 
&repMiniFtrs; repStart[f] += acts.getLeadingDim(); } delete &miniFtrs; } } } } _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); } /* * ==================== * DataGradWorker * ==================== */ DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx) : DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) { // assert(dataGrads.getNumRows() == data.getNumCases()); // assert(!dataGrads.isTrans()); } DataGradWorker::~DataGradWorker() { // delete _dataGrads; } void DataGradWorker::_run() { // DataLayer& dataLayer = *dynamic_cast(&_convNet->getLayer(_dataLayerIdx)); // SoftmaxLayer& softmaxLayer = *dynamic_cast(&_convNet->getLayer(_softmaxLayerIdx)); // softmaxLayer.setDoLogregGrad(false); // Cost& batchCost = *new Cost(0); // for (int i = 0; i < _dp->getNumMinibatches(); i++) { // _convNet->fprop(i, PASS_TEST); // _convNet->getCost(batchCost); // softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad()); // // softmaxLayer.getActsGrad().addScalar(1); // softmaxLayer.getActsGrad().scale(-1); // softmaxLayer.incRcvdBInputs(); // softmaxLayer.bprop(PASS_TEST); // // Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(), // min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize())); // NVMatrix& grads = dataLayer.getActsGrad(); // NVMatrix grads_T; // if (grads.isTrans()) { // NVMatrix& soft_T = grads.getTranspose(); // soft_T.transpose(grads_T); // delete &soft_T; // } else { // grads.transpose(grads_T); // } // grads_T.copyToHost(miniDataGrads); // delete &miniDataGrads; // // _convNet->reset(); // } // cudaThreadSynchronize(); // _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost)); } ================================================ FILE: caffe2/contrib/cuda-convnet2/initw.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
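# These initializers are meant to be referenced from a layer definition file
# through the initWFunc/initBFunc hooks, which layer.py parses in the form
# "moduleName.functionName(param1,param2,...)" (see
# WeightLayerParser.call_init_func). A hypothetical example that seeds a
# layer's weights and biases from an existing checkpoint:
#
#   initWFunc=initw.makew(/path/to/checkpoint)
#   initBFunc=initw.makeb(/path/to/checkpoint)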
from python_util.gpumodel import * import numpy as n import numpy.random as nr def get_src(filename): src = IGPUModel.load_checkpoint(filename) return src['model_state']['layers'] # Initialize weight matrix by copying weight matrix of given layer def makew(name, idx, shape, params): src = get_src(params[0]) return src[name]['weights'][idx] # Initialize bias vector by copying bias vector of given layer def makeb(name, shape, params): src = get_src(params[0]) return src[name]['biases'] def concat(shape, src, src_layers, src_func): mat = n.empty(shape, dtype=n.single, order='F') start = 0 for s in src_layers: m = src_func(src[s]) mat[:,start:start+m.shape[1]] = m start += m.shape[1] return mat # Initialize weight matrix by concatenating weight matrices of given layers def makewcat(name, idx, shape, params): src, src_layers = get_src(params[0]), params[1:] return concat(shape, src, src_layers, lambda x: x['weights'][idx]) # Initialize bias vector by concatenating bias vectors of given layers def makebcat(name, shape, params): src, src_layers = get_src(params[0]), params[1:] return concat(shape, src, src_layers, lambda x: x['biases']) # Initialize bias vector from tuple input def makeb_vec(name, shape, params): return n.array([n.single(x) for x in params], dtype=n.single).reshape((1, len(params))) ================================================ FILE: caffe2/contrib/cuda-convnet2/layer.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
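# Layer definitions are plain ConfigParser files: one [section] per layer,
# each with at least a "type" option, plus an "inputs" option for any layer
# that is not a data layer (see LayerParser.parse_layers and
# LayerWithInputParser.parse below). A minimal, hypothetical fragment in the
# style of the shipped layers-*.cfg files:
#
#   [data]
#   type=data
#   dataIdx=0
#
#   [act]
#   type=neuron
#   inputs=data
#   neuron=relu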
from math import exp import sys import ConfigParser as cfg import os import numpy as n import numpy.random as nr from math import ceil, floor from collections import OrderedDict from os import linesep as NL from python_util.options import OptionsParser import re class LayerParsingError(Exception): pass # A neuron that doesn't take parameters class NeuronParser: def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): self.type = type self.func_str = func_str self.uses_acts = uses_acts self.uses_inputs = uses_inputs def parse(self, type): if type == self.type: return {'type': self.type, 'params': {}, 'usesActs': self.uses_acts, 'usesInputs': self.uses_inputs} return None # A neuron that takes parameters class ParamNeuronParser(NeuronParser): neuron_regex = re.compile(r'^\s*(\w+)\s*\[\s*(\w+(\s*,\w+)*)\s*\]\s*$') def __init__(self, type, func_str, uses_acts=True, uses_inputs=True): NeuronParser.__init__(self, type, func_str, uses_acts, uses_inputs) m = self.neuron_regex.match(type) self.base_type = m.group(1) self.param_names = m.group(2).split(',') assert len(set(self.param_names)) == len(self.param_names) def parse(self, type): m = re.match(r'^%s\s*\[([\d,\.\s\-]*)\]\s*$' % self.base_type, type) if m: try: param_vals = [float(v.strip()) for v in m.group(1).split(',')] if len(param_vals) == len(self.param_names): return {'type': self.base_type, 'params': dict(zip(self.param_names, param_vals)), 'usesActs': self.uses_acts, 'usesInputs': self.uses_inputs} except TypeError: pass return None class AbsTanhNeuronParser(ParamNeuronParser): def __init__(self): ParamNeuronParser.__init__(self, 'abstanh[a,b]', 'f(x) = a * |tanh(b * x)|') def parse(self, type): dic = ParamNeuronParser.parse(self, type) # Make b positive, since abs(tanh(bx)) = abs(tanh(-bx)) and the C++ code # assumes b is positive. 
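# For instance, "abstanh[2,-0.5]" and "abstanh[2,0.5]" both end up with
# params {'a': 2.0, 'b': 0.5} after this normalization.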
if dic: dic['params']['b'] = abs(dic['params']['b']) return dic class ParamParser: lrs_regex = re.compile(r'^\s*(\w+)\s*(?:\[\s*(\w+(\s*;\w+)*)\s*\])?\s*$') param_converters = {'i': int, 'f': float} def __init__(self, type): m = self.lrs_regex.match(type) self.base_type = m.group(1) param_names_with_type = m.group(2).split(';') if m.group(2) is not None else [] self.param_names = [p[1:] for p in param_names_with_type] self.param_types = [self.param_converters[p[0]] for p in param_names_with_type] self.param_regex_inner = ";".join([('\s*%s\s*=\s*[^;,\s=]+\s*' % p) for p in self.param_names]) self.regex_str = ('^%s\s*(?:\[(%s)\])?\s*$') % (self.base_type, self.param_regex_inner) assert len(set(self.param_names)) == len(self.param_names) def parse(self, type): m = re.match(self.regex_str, type, flags=re.IGNORECASE) if m: try: param_vals = [ptype(v.split('=')[1].strip()) for ptype,v in zip(self.param_types, m.group(1).split(';'))] if m.group(1) is not None else [] if len(param_vals) == len(self.param_names): return {'type': self.base_type, 'params': dict(zip(self.param_names, param_vals))} except TypeError: pass return None # Subclass that throws more convnet-specific exceptions than the default class MyConfigParser(cfg.SafeConfigParser): def safe_get(self, section, option, f=cfg.SafeConfigParser.get, typestr=None, default=None): try: return f(self, section, option) except cfg.NoOptionError, e: if default is not None: return default raise LayerParsingError("Layer '%s': required parameter '%s' missing" % (section, option)) except ValueError, e: if typestr is None: raise e raise LayerParsingError("Layer '%s': parameter '%s' must be %s" % (section, option, typestr)) def safe_get_list(self, section, option, f=str, typestr='strings', default=None): v = self.safe_get(section, option, default=default) if type(v) == list: return v try: return [f(x.strip()) for x in v.split(',')] except: raise LayerParsingError("Layer '%s': parameter '%s' must be ','-delimited list of %s" % (section, option, typestr)) def safe_get_int(self, section, option, default=None): return self.safe_get(section, option, f=cfg.SafeConfigParser.getint, typestr='int', default=default) def safe_get_float(self, section, option, default=None): return self.safe_get(section, option, f=cfg.SafeConfigParser.getfloat, typestr='float', default=default) def safe_get_bool(self, section, option, default=None): return self.safe_get(section, option, f=cfg.SafeConfigParser.getboolean, typestr='bool', default=default) def safe_get_float_list(self, section, option, default=None): return self.safe_get_list(section, option, float, typestr='floats', default=default) def safe_get_int_list(self, section, option, default=None): return self.safe_get_list(section, option, int, typestr='ints', default=default) def safe_get_bool_list(self, section, option, default=None): return self.safe_get_list(section, option, lambda x: x.lower() in ('true', '1'), typestr='bools', default=default) # A class that implements part of the interface of MyConfigParser class FakeConfigParser(object): def __init__(self, dic): self.dic = dic def safe_get(self, section, option, default=None): if option in self.dic: return self.dic[option] return default def safe_get_int(self, section, option, default=None): return int(self.safe_get(section, option, default)) def safe_get_int_list(self, section, option, default=None): return list(self.safe_get(section, option, default)) class LayerParser: def __init__(self): self.dic = {} self.set_defaults() # Post-processing step -- this is called 
# after all layers have been initialized
    def optimize(self, layers):
        self.dic['actsTarget'] = -1
        self.dic['actsGradTarget'] = -1
        if len(set(len(l['gpu']) for l in layers.values() if 'inputs' in l and self.dic['name'] in l['inputs'])) > 1:
#            print set(len(l['gpu']) for l in layers.values())
            raise LayerParsingError("Layer '%s': all next layers must have equal number of replicas." % (self.dic['name']))

    def parse_params(self, vals, parsers, param_name, human_name, num_params=1):
        dic, name = self.dic, self.dic['name']
#        print vals
        if len(vals) != num_params and len(vals) != 1:
            raise LayerParsingError("Layer '%s': expected list of length %d for %s but got list of length %d." % (name, num_params, param_name, len(vals)))
        parsed = []
#        print vals
        for v in vals:
            for p in parsers:
                parsedv = p.parse(v)
                if parsedv:
                    parsed += [parsedv]
                    break
        if len(parsed) == 1 and num_params > 1:
            parsed = parsed * num_params
        if len(parsed) == num_params:
            return parsed
#        print parsed, vals
        raise LayerParsingError("Layer '%s': unable to parse %s %s=%s." % (name, human_name, param_name, ",".join(vals)))

    # Add parameters from layer parameter file
    def add_params(self, mcp):
        pass
#        self.dic['conserveMem'] = mcp.convnet.op.get_value('conserve_mem') if mcp.convnet is not None else 0

    def init(self, dic):
        self.dic = dic
        return self

    def set_defaults(self):
        self.dic['outputs'] = 0
        self.dic['parser'] = self
        self.dic['requiresParams'] = False
        # Does this layer use its own activity matrix
        # for some purpose other than computing its output?
        # Usually, this will only be true for layers that require their
        # own activity matrix for gradient computations. For example, layers
        # with logistic units must compute the gradient y * (1 - y), where y is
        # the activity matrix.
        #
        # Layers that do not use their own activity matrix should advertise
        # this, since this will enable memory-saving matrix re-use optimizations.
        #
        # The default value of this property is True, for safety purposes.
        # If a layer advertises that it does not use its own activity matrix when
        # in fact it does, bad things will happen.
        self.dic['usesActs'] = True

        # Does this layer use the activity matrices of its input layers
        # for some purpose other than computing its output?
        #
        # Again true by default for safety
        self.dic['usesInputs'] = True

        # Force this layer to use its own activity gradient matrix,
        # instead of borrowing one from one of its inputs.
        #
        # This should be true for layers where the mapping from output
        # gradient to input gradient is non-elementwise.
        self.dic['forceOwnActs'] = True

        # Does this layer need the gradient at all?
        # Should only be true for layers with parameters (weights).
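        # (WeightLayerParser.add_params below sets this to True whenever any
        # epsW or epsB schedule has a positive base learning rate.)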
self.dic['gradConsumer'] = False # The gpu indices on which this layer runs self.dic['gpu'] = [-1] def parse(self, name, mcp, prev_layers, model=None): self.prev_layers = prev_layers self.dic['name'] = name self.dic['type'] = mcp.safe_get(name, 'type') self.dic['id'] = len(prev_layers) return self.dic def verify_float_range(self, v, param_name, _min, _max): self.verify_num_range(v, param_name, _min, _max, strconv=lambda x: '%.3f' % x) def verify_num_range(self, v, param_name, _min, _max, strconv=lambda x:'%d' % x): if type(v) == list: for i,vv in enumerate(v): self._verify_num_range(vv, param_name, _min, _max, i, strconv=strconv) else: self._verify_num_range(v, param_name, _min, _max, strconv=strconv) def _verify_num_range(self, v, param_name, _min, _max, input=-1, strconv=lambda x:'%d' % x): layer_name = self.dic['name'] if input < 0 else '%s[%d]' % (self.dic['name'], input) if _min is not None and _max is not None and (v < _min or v > _max): raise LayerParsingError("Layer '%s': parameter '%s' must be in the range %s-%s" % (layer_name, param_name, strconv(_min), strconv(_max))) elif _min is not None and v < _min: raise LayerParsingError("Layer '%s': parameter '%s' must be greater than or equal to %s" % (layer_name, param_name, strconv(_min))) elif _max is not None and v > _max: raise LayerParsingError("Layer '%s': parameter '%s' must be smaller than or equal to %s" % (layer_name, param_name, strconv(_max))) def verify_divisible(self, value, div, value_name, div_name=None, input_idx=0): layer_name = self.dic['name'] if len(self.dic['inputs']) == 0 else '%s[%d]' % (self.dic['name'], input_idx) if value % div != 0: raise LayerParsingError("Layer '%s': parameter '%s' must be divisible by %s" % (layer_name, value_name, str(div) if div_name is None else "'%s'" % div_name)) def verify_str_in(self, value, param_name, lst, input_idx=-1): lname = self.dic['name'] if input_idx == -1 else ('%s[%d]' % (self.dic['name'], input_idx)) if value not in lst: raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (lname, param_name, ", ".join("'%s'" % s for s in lst))) def verify_int_in(self, value, param_name, lst): if value not in lst: raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) def verify_all_ints_in(self, values, param_name, lst): if len([v for v in values if v not in lst]) > 0: raise LayerParsingError("Layer '%s': all parameters to '%s' must be among %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst))) def verify_input_dims(self, dims): for i,d in enumerate(dims): if d is not None and self.dic['numInputs'][i] != d: # first input must be labels raise LayerParsingError("Layer '%s': dimensionality of input %d must be %d" % (self.dic['name'], i, d)) # This looks for neuron=x arguments in various layers, and creates # separate layer definitions for them. 
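# For example, a layer declared with "neuron=relu" is split into the original
# layer plus an auto-generated layer named "<layername>_neuron" of type
# 'neuron'; layers that consumed the original layer's output are re-pointed
# at the new one (see NeuronLayerParser.detach_neuron_layer below).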
@staticmethod def detach_neuron_layers(layers): for name,l in layers.items(): if l['type'] != 'neuron' and 'neuron' in l and l['neuron']: NeuronLayerParser().detach_neuron_layer(name, layers) @staticmethod def parse_layers(layer_cfg_path, param_cfg_path, model, layers={}): try: if not os.path.exists(layer_cfg_path): raise LayerParsingError("Layer definition file '%s' does not exist" % layer_cfg_path) if not os.path.exists(param_cfg_path): raise LayerParsingError("Layer parameter file '%s' does not exist" % param_cfg_path) if len(layers) == 0: mcp = MyConfigParser(dict_type=OrderedDict) mcp.readfp(open(layer_cfg_path)) for name in mcp.sections(): if not mcp.has_option(name, 'type'): raise LayerParsingError("Layer '%s': no type given" % name) ltype = mcp.safe_get(name, 'type') if ltype not in layer_parsers: raise LayerParsingError("Layer '%s': Unknown layer type: '%s'" % (name, ltype)) layers[name] = layer_parsers[ltype]().parse(name, mcp, layers, model) LayerParser.detach_neuron_layers(layers) for l in layers.values(): l['parser'].optimize(layers) del l['parser'] for name,l in layers.items(): if not l['type'].startswith('cost.'): found = max(name in l2['inputs'] for l2 in layers.values() if 'inputs' in l2) if not found: raise LayerParsingError("Layer '%s' of type '%s' is unused" % (name, l['type'])) mcp = MyConfigParser(dict_type=OrderedDict) mcp.readfp(open(param_cfg_path)) # mcp.convnet = model for name,l in layers.items(): if not mcp.has_section(name) and l['requiresParams']: raise LayerParsingError("Layer '%s' of type '%s' requires extra parameters, but none given in file '%s'." % (name, l['type'], param_cfg_path)) lp = layer_parsers[l['type']]().init(l) lp.add_params(mcp) except LayerParsingError, e: print e sys.exit(1) return layers @staticmethod def register_layer_parser(ltype, cls): if ltype in layer_parsers: raise LayerParsingError("Layer type '%s' already registered" % ltype) layer_parsers[ltype] = cls # Any layer that takes an input (i.e. non-data layer) class LayerWithInputParser(LayerParser): def __init__(self, num_inputs=-1): LayerParser.__init__(self) self.num_inputs = num_inputs def verify_num_params(self, params, auto_expand=True): for param in params: if len(self.dic[param]) != len(self.dic['inputs']): if auto_expand and len(self.dic[param]) == 1: self.dic[param] *= len(self.dic['inputs']) else: raise LayerParsingError("Layer '%s': %s list length does not match number of inputs" % (self.dic['name'], param)) # layers: dictionary: name -> layer def optimize(self, layers): LayerParser.optimize(self, layers) dic = self.dic # Check if I have an input that no one else uses. #print "Layer %s optimizing" % dic['name'] if not dic['forceOwnActs']: for i, inp in enumerate(dic['inputLayers']): if inp['outputs'] == dic['outputs'] and sum(('inputs' in ll) and (inp['name'] in ll['inputs']) for ll in layers.itervalues()) == 1: # I can share my activity matrix with this layer # if it does not use its activity matrix, and I # do not need to remember my inputs. # TODO: a dropout layer should always be able to overwrite # its input. Make it so. # print "Layer %s(uses inputs=%d), input %s(uses acts = %d)" % (dic['name'], dic['usesInputs'], inp['name'], inp['usesActs']) if not inp['usesActs'] and not dic['usesInputs']: dic['actsTarget'] = i print "Layer %s using acts from layer %s" % (dic['name'], inp['name']) # print "Layer '%s' sharing activity matrix with layer '%s'" % (dic['name'], l['name']) # I can share my gradient matrix with this layer if we're on the same GPU. 
# This is different from the logic for actsTarget because this guy doesn't # have an actsGrad matrix on my GPU if our GPUs are different, so there's # nothing to share. if dic['gpu'] == inp['gpu']: dic['actsGradTarget'] = i # print "Layer '%s' sharing activity gradient matrix with layer '%s'" % (dic['name'], l['name']) def parse(self, name, mcp, prev_layers, model=None): dic = LayerParser.parse(self, name, mcp, prev_layers, model) dic['inputs'] = [inp.strip() for inp in mcp.safe_get(name, 'inputs').split(',')] for inp in dic['inputs']: if inp not in prev_layers: raise LayerParsingError("Layer '%s': input layer '%s' not defined" % (name, inp)) dic['inputLayers'] = [prev_layers[inp] for inp in dic['inputs']] dic['gpu'] = mcp.safe_get_int_list(name, 'gpu', default=dic['inputLayers'][0]['gpu']) dic['gpus'] = ", ".join('%s' % d for d in dic['gpu']) dic['numReplicas'] = len(dic['gpu']) if len(set(dic['gpu'])) != len(dic['gpu']): raise LayerParsingError("Layer '%s': all replicas must run on different GPUs." % (name)) for inp in dic['inputs']: # Data layers do not explicitly define how many replicas they have. # The number of replicas for a data layer is given by the number of replicas # in the next layer(s). So we set that here. inpl = prev_layers[inp] if inpl['type'] == 'data': inpl['numReplicas'] = dic['numReplicas'] if inpl['numReplicas'] % dic['numReplicas'] != 0: raise LayerParsingError("Layer '%s': number of replicas (%d) must divide number of replicas in all input layers (input %s has %d replicas)." % (name, dic['numReplicas'], inpl['name'], inpl['numReplicas'])) if len(set(inp['numReplicas'] for inp in dic['inputLayers'])) != 1: raise LayerParsingError("Layer '%s': all input layers must have equal numbers of replicas." % (name)) # Need to also assert that all *next* layers have equal number of replicas but this is hard so it's done in Layer.optimize for inp in dic['inputLayers']: if inp['outputs'] == 0: raise LayerParsingError("Layer '%s': input layer '%s' does not produce any output" % (name, inp['name'])) dic['numInputs'] = [inp['outputs'] for inp in dic['inputLayers']] # Layers can declare a neuron activation function to apply to their output, as a shortcut # to avoid declaring a separate neuron layer above themselves. dic['neuron'] = mcp.safe_get(name, 'neuron', default="") if self.num_inputs > 0 and len(dic['numInputs']) != self.num_inputs: raise LayerParsingError("Layer '%s': number of inputs must be %d" % (name, self.num_inputs)) if model: self.verify_all_ints_in(dic['gpu'], 'gpu', range(len(model.op.get_value('gpu')))) return dic def verify_img_size(self): dic = self.dic if dic['numInputs'][0] % dic['imgPixels'] != 0 or dic['imgSize'] * dic['imgSize'] != dic['imgPixels']: raise LayerParsingError("Layer '%s': has %-d dimensional input, not interpretable as %d-channel images" % (dic['name'], dic['numInputs'][0], dic['channels'])) @staticmethod def grad_consumers_below(dic): if dic['gradConsumer']: return True if 'inputLayers' in dic: return any(LayerWithInputParser.grad_consumers_below(l) for l in dic['inputLayers']) def verify_no_grads(self): if LayerWithInputParser.grad_consumers_below(self.dic): raise LayerParsingError("Layer '%s': layers of type '%s' cannot propagate gradient and must not be placed over layers with parameters." 
% (self.dic['name'], self.dic['type']))

class NailbedLayerParser(LayerWithInputParser):
    def __init__(self):
        LayerWithInputParser.__init__(self, num_inputs=1)

    def parse(self, name, mcp, prev_layers, model=None):
        dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
        dic['forceOwnActs'] = False
        dic['usesActs'] = False
        dic['usesInputs'] = False
        dic['channels'] = mcp.safe_get_int(name, 'channels')
        dic['stride'] = mcp.safe_get_int(name, 'stride')
        self.verify_num_range(dic['channels'], 'channels', 1, None)
        # Computed values
        dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
        dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
        dic['outputsX'] = (dic['imgSize'] + dic['stride'] - 1) / dic['stride']
        dic['start'] = (dic['imgSize'] - dic['stride'] * (dic['outputsX'] - 1)) / 2
        dic['outputs'] = dic['channels'] * dic['outputsX']**2
        self.verify_num_range(dic['outputsX'], 'outputsX', 0, None)
        self.verify_img_size()
        print "Initialized bed-of-nails layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels'])
        return dic

class GaussianBlurLayerParser(LayerWithInputParser):
    def __init__(self):
        LayerWithInputParser.__init__(self, num_inputs=1)

    def parse(self, name, mcp, prev_layers, model=None):
        dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
        dic['forceOwnActs'] = False
        dic['usesActs'] = False
        dic['usesInputs'] = False
        dic['outputs'] = dic['numInputs'][0]
        dic['channels'] = mcp.safe_get_int(name, 'channels')
        dic['filterSize'] = mcp.safe_get_int(name, 'filterSize')
        dic['stdev'] = mcp.safe_get_float(name, 'stdev')
        self.verify_num_range(dic['channels'], 'channels', 1, None)
        self.verify_int_in(dic['filterSize'], 'filterSize', [3, 5, 7, 9])
        # Computed values
        dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
        dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
        dic['filter'] = n.array([exp(-(dic['filterSize']/2 - i)**2 / float(2 * dic['stdev']**2))
                                 for i in xrange(dic['filterSize'])], dtype=n.float32).reshape(1, dic['filterSize'])
        dic['filter'] /= dic['filter'].sum()
        self.verify_img_size()
        if dic['filterSize'] > dic['imgSize']:
            raise LayerParsingError("Layer '%s': filter size (%d) must be smaller than image size (%d)."
% (dic['name'], dic['filterSize'], dic['imgSize'])) print "Initialized Gaussian blur layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) return dic class HorizontalReflectionLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['outputs'] = dic['numInputs'][0] dic['channels'] = mcp.safe_get_int(name, 'channels') self.verify_num_range(dic['channels'], 'channels', 1, 3) # Computed values dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) self.verify_img_size() print "Initialized horizontal reflection layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) return dic class ResizeLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['forceOwnActs'] = False dic['usesActs'] = False dic['usesInputs'] = False dic['channels'] = mcp.safe_get_int(name, 'channels') dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) dic['scale'] = mcp.safe_get_float(name, 'scale') dic['tgtSize'] = int(floor(dic['imgSize'] / dic['scale'])) dic['tgtPixels'] = dic['tgtSize']**2 self.verify_num_range(dic['channels'], 'channels', 1, None) # Really not recommended to use this for such severe scalings self.verify_float_range(dic['scale'], 'scale', 0.5, 2) dic['outputs'] = dic['channels'] * dic['tgtPixels'] self.verify_img_size() self.verify_no_grads() print "Initialized resize layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels']) return dic class RandomScaleLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['forceOwnActs'] = False dic['usesActs'] = False dic['usesInputs'] = False dic['channels'] = mcp.safe_get_int(name, 'channels') self.verify_num_range(dic['channels'], 'channels', 1, None) # Computed values dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) dic['maxScale'] = mcp.safe_get_float(name, 'maxScale') dic['tgtSize'] = mcp.safe_get_int(name, 'tgtSize') min_size = int(floor(dic['imgSize'] / dic['maxScale'])) max_size = dic['imgSize'] #int(floor(dic['imgSize'] * dic['maxScale'])) if dic['tgtSize'] < min_size: raise LayerParsingError("Layer '%s': target size must be greater than minimum image size after rescaling (%d)" % (name, min_size)) if dic['tgtSize'] > max_size: raise LayerParsingError("Layer '%s': target size must be smaller than maximum image size after rescaling (%d)" % (name, max_size)) dic['tgtPixels'] = dic['tgtSize']**2 self.verify_float_range(dic['maxScale'], 'maxScale', 1, 2) dic['outputs'] = dic['channels'] * dic['tgtPixels'] self.verify_img_size() self.verify_no_grads() print "Initialized random scale layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels']) return dic class CropLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, 
model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['forceOwnActs'] = False dic['usesActs'] = False dic['usesInputs'] = False dic['channels'] = mcp.safe_get_int(name, 'channels') self.verify_num_range(dic['channels'], 'channels', 1, None) dic['startX'] = mcp.safe_get_int(name, 'startX') dic['startY'] = mcp.safe_get_int(name, 'startY', default=dic['startX']) dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') # Computed values dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) dic['outputs'] = dic['channels'] * (dic['sizeX']**2) self.verify_num_range(dic['startX'], 'startX', 0, dic['imgSize']-1) self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) self.verify_num_range(dic['startY'], 'startY', 0, dic['imgSize']-1) self.verify_img_size() self.verify_no_grads() if dic['startX'] + dic['sizeX'] > dic['imgSize']: raise LayerParsingError("Layer '%s': startX (%d) + sizeX (%d) > imgSize (%d)" % (name, dic['startX'], dic['sizeX'], dic['imgSize'])) print "Initialized cropping layer '%s', producing %dx%d %d-channel output" % (name, dic['sizeX'], dic['sizeX'], dic['channels']) return dic class ColorTransformLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['forceOwnActs'] = False dic['usesActs'] = False dic['usesInputs'] = False # Computed values dic['imgPixels'] = dic['numInputs'][0] / 3 dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) dic['channels'] = 3 dic['outputs'] = dic['numInputs'][0] self.verify_img_size() self.verify_no_grads() return dic class RGBToYUVLayerParser(ColorTransformLayerParser): def __init__(self): ColorTransformLayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model=None): dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) print "Initialized RGB --> YUV layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) return dic class RGBToLABLayerParser(ColorTransformLayerParser): def __init__(self): ColorTransformLayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model=None): dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model) dic['center'] = mcp.safe_get_bool(name, 'center', default=False) print "Initialized RGB --> LAB layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels']) return dic class NeuronLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) @staticmethod def get_unused_layer_name(layers, wish): if wish not in layers: return wish for i in xrange(1, 100): name = '%s.%d' % (wish, i) if name not in layers: return name raise LayerParsingError("This is insane.") def parse_neuron(self, neuron_str): for n in neuron_parsers: p = n.parse(neuron_str) if p: # Successfully parsed neuron, return it self.dic['neuron'] = p self.dic['usesActs'] = self.dic['neuron']['usesActs'] self.dic['usesInputs'] = self.dic['neuron']['usesInputs'] return # Could not parse neuron # Print available neuron types colnames = ['Neuron type', 'Function'] m = max(len(colnames[0]), OptionsParser._longest_value(neuron_parsers, key=lambda x:x.type)) + 2 ntypes = [OptionsParser._bold(colnames[0].ljust(m))] + [n.type.ljust(m) for n in neuron_parsers] fnames = [OptionsParser._bold(colnames[1])] + [n.func_str for n in 
neuron_parsers] usage_lines = NL.join(ntype + fname for ntype,fname in zip(ntypes, fnames)) raise LayerParsingError("Layer '%s': unable to parse neuron type '%s'. Valid neuron types: %sWhere neurons have parameters, they must be floats." % (self.dic['name'], neuron_str, NL + usage_lines + NL)) def detach_neuron_layer(self, src_name, layers): dic = self.dic # self.set_defaults() dic['name'] = NeuronLayerParser.get_unused_layer_name(layers, '%s_neuron' % src_name) dic['type'] = 'neuron' dic['inputs'] = src_name dic['neuron'] = layers[src_name]['neuron'] dic['gpu'] = layers[src_name]['gpu'] # Yes it's not entirely correct to pass all of layers as prev_layers, but it's harmless dic = self.parse(dic['name'], FakeConfigParser(dic), layers) dic['src_layer'] = src_name # Link upper layers to this new one for l in layers.values(): if 'inputs' in l: l['inputs'] = [inp if inp != src_name else dic['name'] for inp in l['inputs']] l['inputLayers'] = [inp if inp['name'] != src_name else dic for inp in l['inputLayers']] layers[dic['name']] = dic def parse(self, name, mcp, prev_layers, model=None): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['outputs'] = dic['numInputs'][0] self.parse_neuron(dic['neuron']) dic['forceOwnActs'] = False print "Initialized neuron layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class EltwiseSumLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self) def add_params(self, mcp): LayerWithInputParser.add_params(self, mcp) dic, name = self.dic, self.dic['name'] dic['coeffs'] = mcp.safe_get_float_list(name, 'coeffs', default=[1.0] * len(dic['inputs'])) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) if len(set(dic['numInputs'])) != 1: raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) dic['outputs'] = dic['numInputs'][0] dic['usesInputs'] = False dic['usesActs'] = False dic['forceOwnActs'] = False dic['requiresParams'] = True print "Initialized elementwise sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class EltwiseMaxLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) if len(dic['inputs']) < 2: raise LayerParsingError("Layer '%s': elementwise max layer must have at least 2 inputs, got %d." % (name, len(dic['inputs']))) if len(set(dic['numInputs'])) != 1: raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. 
Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs']))) dic['outputs'] = dic['numInputs'][0] print "Initialized elementwise max layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class SumLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['stride'] = mcp.safe_get_int(name, 'stride', default=1) self.verify_divisible(dic['numInputs'][0], dic['stride'], 'input dimensionality', 'stride') dic['outputs'] = dic['numInputs'][0] / dic['stride'] print "Initialized sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class DropoutLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def add_params(self, mcp): LayerWithInputParser.add_params(self, mcp) dic, name = self.dic, self.dic['name'] dic['enable'] = mcp.safe_get_bool(name, 'enable', default=True) dic['keep'] = mcp.safe_get_float(name, 'keep', default=0.5) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['requiresParams'] = True dic['usesInputs'] = False dic['usesActs'] = False dic['forceOwnActs'] = False dic['outputs'] = dic['numInputs'][0] print "Initialized %s layer '%s' on GPUs %s, producing %d outputs" % (dic['type'], name, dic['gpus'], dic['outputs']) return dic class Dropout2LayerParser(DropoutLayerParser): def __init__(self): DropoutLayerParser.__init__(self) class WeightLayerParser(LayerWithInputParser): LAYER_PAT = re.compile(r'^\s*([^\s\[]+)(?:\[(\d+)\])?\s*$') # matches things like layername[5], etc def __init__(self, num_inputs=-1): LayerWithInputParser.__init__(self, num_inputs=num_inputs) @staticmethod def get_layer_name(name_str): m = WeightLayerParser.LAYER_PAT.match(name_str) if not m: return None return m.group(1), m.group(2) def add_params(self, mcp): LayerWithInputParser.add_params(self, mcp) dic, name = self.dic, self.dic['name'] dic['momW'] = mcp.safe_get_float_list(name, 'momW') dic['momB'] = mcp.safe_get_float(name, 'momB') dic['superEps'] = mcp.safe_get_float(name, 'superEps', default=0.0) dic['superMom'] = mcp.safe_get_float(name, 'superMom', default=0.0) dic['wc'] = mcp.safe_get_float_list(name, 'wc', default=[0.0] * len(dic['inputs'])) dic['wball'] = mcp.safe_get_float_list(name, 'wball', default=[0.0] * len(dic['inputs'])) self.verify_num_params(['momW', 'wc', 'wball']) # dic['wballNormed'] = [wball * nweights for wball,nweights in zip(dic['wball'], dic['weightsPerFilter'])] dic['wballNormed'] = dic['wball'] # Convert from old-style 0.001,0.02 hyperparam specification to new-stye # const[base=0.001],const[base=0.02] and so forth def convert_scalars_to_schedules(scalars): parts = scalars.split(',') for i,p in enumerate(parts): p = p.strip() if re.match('(?:\d*\.)?\d+$', p): parts[i] = 'const[base=%s]' % p return parts dic['epsW'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsW')), lrs_parsers, 'epsW', 'learning rate schedule', num_params=len(dic['inputs'])) dic['epsB'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsB')), lrs_parsers, 'epsB', 'learning rate schedule', num_params=1)[0] dic['updatePeriod'] = mcp.safe_get_int(name, 'updatePeriod', default=0) # 0 means update as often as possible # TODO: assert that updatePeriod is a multiple of active pass period, 
        dic['epsW'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsW')), lrs_parsers, 'epsW', 'learning rate schedule', num_params=len(dic['inputs']))
        dic['epsB'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsB')), lrs_parsers, 'epsB', 'learning rate schedule', num_params=1)[0]
        dic['updatePeriod'] = mcp.safe_get_int(name, 'updatePeriod', default=0) # 0 means update as often as possible
        # TODO: assert that updatePeriod is a multiple of active pass period, which is unknown here.
        # the assert has to go in some post-processing step..
        dic['gradConsumer'] = dic['epsB']['params']['base'] > 0 or any(w['params']['base'] > 0 for w in dic['epsW'])

    @staticmethod
    def unshare_weights(layer, layers, matrix_idx=None):
        def unshare(layer, layers, indices):
            for i in indices:
                if layer['weightSourceLayers'][i] >= 0:
                    src_matrix_idx = layer['weightSourceMatrixIndices'][i]
                    layer['weightSourceLayers'][i] = ""
                    layer['weightSourceMatrixIndices'][i] = -1
                    layer['weights'][i] = layer['weights'][i].copy()
                    layer['weightsInc'][i] = n.zeros_like(layer['weights'][i])
                    print "Unshared weight matrix %s[%d] from %s[%d]." % (layer['name'], i, layer['weightSourceLayers'][i], src_matrix_idx)
                else:
                    print "Weight matrix %s[%d] already unshared." % (layer['name'], i)
        if 'weightSourceLayers' in layer:
            unshare(layer, layers, range(len(layer['inputs'])) if matrix_idx is None else [matrix_idx])

    # Load weight/biases initialization module
    def call_init_func(self, param_name, shapes, input_idx=-1):
        dic = self.dic
        func_pat = re.compile('^([^\.]+)\.([^\(\)]+)\s*(?:\(([^,]+(?:,[^,]+)*)\))?$')
        m = func_pat.match(dic[param_name])
        if not m:
            raise LayerParsingError("Layer '%s': '%s' parameter must have format 'moduleName.functionName(param1,param2,...)'; got: %s." % (dic['name'], param_name, dic[param_name]))
        module, func = m.group(1), m.group(2)
        params = m.group(3).split(',') if m.group(3) is not None else []
        try:
            mod = __import__(module)
            return getattr(mod, func)(dic['name'], input_idx, shapes, params=params) if input_idx >= 0 else getattr(mod, func)(dic['name'], shapes, params=params)
        except (ImportError, AttributeError, TypeError), e:
            raise LayerParsingError("Layer '%s': %s." % (dic['name'], e))

    def make_weights(self, initW, rows, cols, order='C'):
        dic = self.dic
        dic['weights'], dic['weightsInc'] = [], []
        if dic['initWFunc']: # Initialize weights from user-supplied python function
            # Initialization function is supplied in the format
            # module.func
            for i in xrange(len(dic['inputs'])):
                dic['weights'] += [self.call_init_func('initWFunc', (rows[i], cols[i]), input_idx=i)]
                if type(dic['weights'][i]) != n.ndarray:
                    raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], i, dic['initWFunc'], type(dic['weights'][i])))
                if dic['weights'][i].dtype != n.float32:
                    raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must return weight matrices consisting of single-precision floats. Got: %s." % (dic['name'], i, dic['initWFunc'], dic['weights'][i].dtype))
                if dic['weights'][i].shape != (rows[i], cols[i]):
                    raise LayerParsingError("Layer '%s[%d]': weight matrix returned by weight initialization function %s has wrong shape. Should be: %s; got: %s."
                                            % (dic['name'], i, dic['initWFunc'], (rows[i], cols[i]), dic['weights'][i].shape))
                # Convert to desired order
                dic['weights'][i] = n.require(dic['weights'][i], requirements=order)
                dic['weightsInc'] += [n.zeros_like(dic['weights'][i])]
                print "Layer '%s[%d]' initialized weight matrices from function %s" % (dic['name'], i, dic['initWFunc'])
        else:
            for i in xrange(len(dic['inputs'])):
                if dic['weightSourceLayers'][i] != '': # Shared weight matrix
                    src_layer = self.prev_layers[dic['weightSourceLayers'][i]] if dic['weightSourceLayers'][i] != dic['name'] else dic
                    dic['weights'] += [src_layer['weights'][dic['weightSourceMatrixIndices'][i]]]
                    dic['weightsInc'] += [src_layer['weightsInc'][dic['weightSourceMatrixIndices'][i]]]
                    if dic['weights'][i].shape != (rows[i], cols[i]):
                        raise LayerParsingError("Layer '%s': weight sharing source matrix '%s' has shape %dx%d; should be %dx%d." % (dic['name'], dic['weightSource'][i], dic['weights'][i].shape[0], dic['weights'][i].shape[1], rows[i], cols[i]))
                    print "Layer '%s' initialized weight matrix %d from %s" % (dic['name'], i, dic['weightSource'][i])
                else:
                    dic['weights'] += [n.array(initW[i] * nr.randn(rows[i], cols[i]), dtype=n.single, order=order)]
                    dic['weightsInc'] += [n.zeros_like(dic['weights'][i])]

    def make_biases(self, rows, cols, order='C'):
        dic = self.dic
        if dic['initBFunc']:
            dic['biases'] = self.call_init_func('initBFunc', (rows, cols))
            if type(dic['biases']) != n.ndarray:
                raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], dic['initBFunc'], type(dic['biases'])))
            if dic['biases'].dtype != n.float32:
                raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object consisting of single-precision floats. Got: %s." % (dic['name'], dic['initBFunc'], dic['biases'].dtype))
            if dic['biases'].shape != (rows, cols):
                raise LayerParsingError("Layer '%s': bias vector returned by bias initialization function %s has wrong shape. Should be: %s; got: %s." % (dic['name'], dic['initBFunc'], (rows, cols), dic['biases'].shape))
            dic['biases'] = n.require(dic['biases'], requirements=order)
            print "Layer '%s' initialized bias vector from function %s" % (dic['name'], dic['initBFunc'])
        else:
            dic['biases'] = dic['initB'] * n.ones((rows, cols), order=order, dtype=n.single)
        dic['biasesInc'] = n.zeros_like(dic['biases'])

    def parse(self, name, mcp, prev_layers, model):
        dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
        dic['requiresParams'] = True
        dic['gradConsumer'] = True
        dic['usesActs'] = False
        dic['initW'] = mcp.safe_get_float_list(name, 'initW', default=0.01)
        dic['initB'] = mcp.safe_get_float(name, 'initB', default=0)
        dic['initWFunc'] = mcp.safe_get(name, 'initWFunc', default="")
        dic['initBFunc'] = mcp.safe_get(name, 'initBFunc', default="")
        # Find shared weight matrices
        dic['weightSource'] = mcp.safe_get_list(name, 'weightSource', default=[''] * len(dic['inputs']))
        self.verify_num_params(['initW'])
        self.verify_num_params(['weightSource'], auto_expand=False)
        dic['weightSourceLayers'] = []
        dic['weightSourceMatrixIndices'] = []
        for i, src_name in enumerate(dic['weightSource']):
            src_layer_matrix_idx = -1
            src_layer_name = ''
            if src_name != '':
                src_layer_match = WeightLayerParser.get_layer_name(src_name)
                if src_layer_match is None:
                    raise LayerParsingError("Layer '%s': unable to parse weight sharing source '%s'. Format is layer[idx] or just layer, in which case idx=0 is used."
% (name, src_name)) src_layer_name = src_layer_match[0] src_layer_matrix_idx = int(src_layer_match[1]) if src_layer_match[1] is not None else 0 if src_layer_name not in prev_layers and src_layer_name != name: raise LayerParsingError("Layer '%s': weight sharing source layer '%s' does not exist." % (name, src_layer_name)) # src_layer_idx = prev_names.index(src_layer_name) if src_layer_name != name else len(prev_names) src_layer = prev_layers[src_layer_name] if src_layer_name != name else dic if src_layer['gpu'] != dic['gpu']: raise LayerParsingError("Layer '%s': weight sharing source layer '%s' runs on GPUs %s, while '%s' runs on GPUs %s." % (name, src_layer_name, src_layer['gpu'], name, dic['gpu'])) if src_layer['type'] != dic['type']: raise LayerParsingError("Layer '%s': weight sharing source layer '%s' is of type '%s'; should be '%s'." % (name, src_layer_name, src_layer['type'], dic['type'])) if src_layer_name != name and len(src_layer['weights']) <= src_layer_matrix_idx: raise LayerParsingError("Layer '%s': weight sharing source layer '%s' has %d weight matrices, but '%s[%d]' requested." % (name, src_layer_name, len(src_layer['weights']), src_name, src_layer_matrix_idx)) if src_layer_name == name and src_layer_matrix_idx >= i: raise LayerParsingError("Layer '%s': weight sharing source '%s[%d]' not defined yet." % (name, name, src_layer_matrix_idx)) dic['weightSourceLayers'] += [src_layer_name] dic['weightSourceMatrixIndices'] += [src_layer_matrix_idx] return dic class FCLayerParser(WeightLayerParser): def __init__(self): WeightLayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) dic['outputs'] = mcp.safe_get_int(name, 'outputs') dic['weightsPerFilter'] = dic['numInputs'] self.verify_num_range(dic['outputs'], 'outputs', 1, None) self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']] * len(dic['numInputs']), order='F') self.make_biases(1, dic['outputs'], order='F') print "Initialized fully-connected layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class SplitFCLayerParser(WeightLayerParser): def __init__(self): WeightLayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model) dic['parts'] = mcp.safe_get_int(name, 'parts') dic['outputs'] = mcp.safe_get_int(name, 'outputs') * dic['parts'] dic['weightsPerFilter'] = dic['numInputs'] self.verify_num_range(dic['parts'], 'parts', 1, None) self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']/dic['parts']] * len(dic['numInputs']), order='F') self.make_biases(1, dic['outputs'], order='F') for i in xrange(len(dic['numInputs'])): self.verify_divisible(dic['numInputs'][i], dic['parts'], 'numInputs', 'parts', input_idx=i) print "Initialized split fully-connected layer '%s' on GPUs %s, producing %d outputs in %d parts" % (name, dic['gpus'], dic['outputs'], dic['parts']) return dic class LocalLayerParser(WeightLayerParser): def __init__(self): WeightLayerParser.__init__(self) # Convert convolutional layer to unshared, locally-connected layer @staticmethod def conv_to_local(layers, lname): layer = layers[lname] if layer['type'] == 'conv': layer['type'] = 'local' for inp,inpname in enumerate(layer['inputs']): src_layer_name = layer['weightSourceLayers'][inp] if src_layer_name != '': src_layer = layers[src_layer_name] src_matrix_idx = layer['weightSourceMatrixIndices'][inp] LocalLayerParser.conv_to_local(layers, 
src_layer_name)
                    for w in ('weights', 'weightsInc'):
                        layer[w][inp] = src_layer[w][src_matrix_idx]
                else:
                    layer['weights'][inp] = n.require(n.reshape(n.tile(n.reshape(layer['weights'][inp], (1, n.prod(layer['weights'][inp].shape))), (layer['modules'], 1)),
                                                                (layer['modules'] * layer['filterChannels'][inp] * layer['filterPixels'][inp], layer['filters'])),
                                                      requirements='C')
                    layer['weightsInc'][inp] = n.zeros_like(layer['weights'][inp])
            if layer['sharedBiases']:
                layer['biases'] = n.require(n.repeat(layer['biases'], layer['modules'], axis=0), requirements='C')
                layer['biasesInc'] = n.zeros_like(layer['biases'])
            print "Converted layer '%s' from convolutional to unshared, locally-connected" % layer['name']
            # Also call this function on any layers sharing my weights
            for l in layers.values():
                if 'weightSourceLayers' in l and lname in l['weightSourceLayers']:
                    LocalLayerParser.conv_to_local(layers, l['name'])
        return layer
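    # For example (hypothetical sizes, for illustration): a 'conv' layer with
    # 5x5 filters over 3 input channels and a 27x27 output grid has
    # modules = 729 and one shared weight matrix of shape
    # (filterChannels * filterPixels, filters) = (75, filters). conv_to_local
    # tiles that matrix once per module into shape (729 * 75, filters), so
    # every output position gets its own private copy of the filters.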
% (name, ", ".join("%dx%dx%d" % (filters, modulesX, modulesX) for filters,modulesX in zip(dic['filters'], dic['modulesX'])))) dic['modulesX'] = dic['modulesX'][0] dic['modules'] = dic['modulesX']**2 dic['filters'] = dic['filters'][0] dic['outputs'] = dic['modules'] * dic['filters'] # dic['filterConns'] = [[]] * len(dic['inputs']) for i in xrange(len(dic['inputs'])): if dic['numInputs'][i] % dic['imgPixels'][i] != 0 or dic['imgSize'][i] * dic['imgSize'][i] != dic['imgPixels'][i]: raise LayerParsingError("Layer '%s[%d]': has %-d dimensional input, not interpretable as square %d-channel images" % (name, i, dic['numInputs'][i], dic['channels'][i])) if dic['channels'][i] > 3 and dic['channels'][i] % 4 != 0: raise LayerParsingError("Layer '%s[%d]': number of channels must be smaller than 4 or divisible by 4" % (name, i)) # if dic['filterSize'][i] > totalPadding[i] + dic['imgSize'][i]: # raise LayerParsingError("Layer '%s[%d]': filter size (%d) greater than image size + padding (%d)" % (name, i, dic['filterSize'][i], dic['padding'][i] + dic['imgSize'][i])) if -dic['padding'][i] + dic['stride'][i] * (dic['modulesX'] - 1) + dic['filterSize'][i] < dic['imgSize'][i]: raise LayerParsingError("Layer '%s[%d]': %dx%d output map with padding=%d, stride=%d does not cover entire input image." % (name, i, dic['modulesX'], dic['outputsX'], dic['padding'][i], dic['stride'][i])) if dic['groups'][i] > 1: self.verify_divisible(dic['channels'][i], 4*dic['groups'][i], 'channels', '4 * groups', input_idx=i) self.verify_divisible(dic['channels'][i], dic['groups'][i], 'channels', 'groups', input_idx=i) self.verify_divisible(dic['filters'], 16*dic['groups'][i], 'filters * groups', input_idx=i) dic['padding'][i] = -dic['padding'][i] # dic['overSample'] = [groups*filterChannels/channels for groups,filterChannels,channels in zip(dic['groups'], dic['filterChannels'], dic['channels'])] dic['weightsPerFilter'] = [fc * (fz**2) for fc, fz in zip(dic['filterChannels'], dic['filterSize'])] return dic class ConvLayerParser(LocalLayerParser): def __init__(self): LocalLayerParser.__init__(self) def add_params(self, mcp): LocalLayerParser.add_params(self, mcp) self.dic['wcNormMax'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMax', default=[0.0] * len(self.dic['inputs'])) self.dic['wcNormMin'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMin', default=[0.0] * len(self.dic['inputs'])) self.verify_num_params(['wcNormMax', 'wcNormMin']) for min,max in zip(self.dic['wcNormMin'], self.dic['wcNormMax']): if min > max: raise LayerParsingError("Layer '%s': wcNormMin must be <= wcNormMax." 
% (self.dic['name'])) def parse(self, name, mcp, prev_layers, model): dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) dic['sumWidth'] = mcp.safe_get_int(name, 'sumWidth') dic['sharedBiases'] = mcp.safe_get_bool(name, 'sharedBiases', default=True) num_biases = dic['filters'] if dic['sharedBiases'] else dic['modules']*dic['filters'] eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] self.make_weights(dic['initW'], eltmult(dic['filterPixels'], dic['filterChannels']), [dic['filters']] * len(dic['inputs']), order='C') self.make_biases(num_biases, 1, order='C') print "Initialized convolutional layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) return dic class LocalUnsharedLayerParser(LocalLayerParser): def __init__(self): LocalLayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model) eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)] scmult = lambda x, lst: [x * l for l in lst] self.make_weights(dic['initW'], scmult(dic['modules'], eltmult(dic['filterPixels'], dic['filterChannels'])), [dic['filters']] * len(dic['inputs']), order='C') self.make_biases(dic['modules'] * dic['filters'], 1, order='C') print "Initialized locally-connected layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters']) return dic class DataLayerParser(LayerParser): def __init__(self): LayerParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = LayerParser.parse(self, name, mcp, prev_layers, model) dic['dataIdx'] = mcp.safe_get_int(name, 'dataIdx') dic['start'] = mcp.safe_get_int(name, 'start', default=0) dic['end'] = mcp.safe_get_int(name, 'end', default=model.train_data_provider.get_data_dims(idx=dic['dataIdx'])) dic['outputs'] = dic['end'] - dic['start'] # dic['usesActs'] = False print "Initialized data layer '%s', producing %d outputs" % (name, dic['outputs']) return dic class SoftmaxLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['outputs'] = dic['inputLayers'][0]['outputs'] print "Initialized softmax layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class ConcatentionLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] print "Initialized concatenation layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class PassThroughLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self) # Note: this doesn't verify all the necessary constraints. Layer construction may still fail in C++ code. # For example, it does not verify that every layer only has one pass-through parent. Obviously having # two such parents is incoherent. 
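    # As an illustration (hypothetical layer names, following the cfg syntax of
    # the layer definition files below): the following pair would be rejected
    # by the conflicting_layers check in parse(), because both pass-through
    # layers consume 'fcshared' and their GPU sets overlap on GPU 0:
    #     [passa] type=pass inputs=fcshared gpu=0
    #     [passb] type=pass inputs=fcshared gpu=0,1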
def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) # if len(dic['inputLayers']) == 1: # raise LayerParsingError("Layer %s: pass-through layer must have more than one input." % dic['name']) if len(dic['gpu']) != len(dic['inputLayers'][0]['gpu']): raise LayerParsingError("Layer '%s': number of replicas in pass-through layer must be equivalent to number of replicas in input layers." % dic['name']) for inp in dic['inputLayers']: conflicting_layers = [l for l in prev_layers.values() if l['type'] == 'pass' and inp['name'] in l['inputs'] and len(set(dic['gpu']).intersection(set(l['gpu']))) > 0] if len(conflicting_layers) > 0: raise LayerParsingError("Layer '%s' conflicts with layer '%s'. Both pass-through layers take layer '%s' as input and operate on an overlapping set of GPUs." % (dic['name'], conflicting_layers[0]['name'], inp['name'])) dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers']) # dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))] print "Initialized pass-through layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs']) return dic class PoolLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def add_params(self, mcp): LayerWithInputParser.add_params(self, mcp) dic, name = self.dic, self.dic['name'] def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['channels'] = mcp.safe_get_int(name, 'channels') dic['sizeX'] = mcp.safe_get_int(name, 'sizeX') dic['start'] = mcp.safe_get_int(name, 'start', default=0) dic['stride'] = mcp.safe_get_int(name, 'stride') dic['outputsX'] = mcp.safe_get_int(name, 'outputsX', default=0) dic['pool'] = mcp.safe_get(name, 'pool') # Avg pooler does not use its acts or inputs dic['usesActs'] = dic['pool'] != 'avg' dic['usesInputs'] = dic['pool'] != 'avg' dic['imgPixels'] = dic['numInputs'][0] / dic['channels'] dic['imgSize'] = int(n.sqrt(dic['imgPixels'])) if dic['pool'] == 'avg': dic['sum'] = mcp.safe_get_bool(name, 'sum', default=False) self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize']) self.verify_num_range(dic['stride'], 'stride', 1, dic['sizeX']) self.verify_num_range(dic['outputsX'], 'outputsX', 0, None) self.verify_num_range(dic['channels'], 'channels', 1, None) if LayerWithInputParser.grad_consumers_below(dic): self.verify_divisible(dic['channels'], 16, 'channels') self.verify_str_in(dic['pool'], 'pool', ['max', 'maxabs', 'avg']) self.verify_img_size() if dic['outputsX'] <= 0: dic['outputsX'] = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1; dic['outputs'] = dic['outputsX']**2 * dic['channels'] print "Initialized %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels']) return dic class CrossMapPoolLayerParser(LayerWithInputParser): def __init__(self): LayerWithInputParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model): dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model) dic['channels'] = mcp.safe_get_int(name, 'channels') dic['size'] = mcp.safe_get_int(name, 'size') dic['start'] = mcp.safe_get_int(name, 'start', default=0) dic['stride'] = mcp.safe_get_int(name, 'stride') dic['outputChannels'] = mcp.safe_get_int(name, 'outputs', default=0) dic['pool'] = mcp.safe_get(name, 
'pool')
        dic['requiresParams'] = False
        # Avg pooler does not use its acts or inputs
        dic['usesActs'] = dic['pool'] != 'avg'
        dic['usesInputs'] = dic['pool'] != 'avg'
        dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
        dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
        dic['outputs'] = dic['outputChannels'] * dic['imgPixels']
        self.verify_num_range(dic['size'], 'size', 1, dic['channels'])
        self.verify_num_range(dic['stride'], 'stride', 1, dic['size'])
        self.verify_num_range(dic['outputChannels'], 'outputChannels', 0, None)
        self.verify_num_range(dic['channels'], 'channels', 1, None)
        self.verify_num_range(dic['start'], 'start', None, 0)
        self.verify_str_in(dic['pool'], 'pool', ['max'])
        self.verify_img_size()
        covered_chans = dic['start'] + (dic['outputChannels'] - 1) * dic['stride'] + dic['size']
        if covered_chans < dic['channels']:
            raise LayerParsingError("Layer '%s': cross-map pooling with start=%d, stride=%d, size=%d, outputs=%d covers only %d of %d input channels." % \
                                    (name, dic['start'], dic['stride'], dic['size'], dic['outputChannels'], covered_chans, dic['channels']))
        print "Initialized cross-map %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['outputChannels'])
        return dic
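# Worked example for the coverage check in CrossMapPoolLayerParser.parse above
# (hypothetical numbers, for illustration): with channels=64, start=0, size=4,
# stride=2 and outputs=31, the last pooling window covers channels [60, 64), so
# covered_chans = 0 + 30*2 + 4 = 64 and the configuration is accepted; with
# outputs=30 only 62 of the 64 channels would be covered, which raises a
# LayerParsingError.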
class NormLayerParser(LayerWithInputParser):
    RESPONSE_NORM = 'response'
    CONTRAST_NORM = 'contrast'
    CROSSMAP_RESPONSE_NORM = 'cross-map response'

    def __init__(self, norm_type):
        LayerWithInputParser.__init__(self, num_inputs=1)
        self.norm_type = norm_type

    def add_params(self, mcp):
        LayerWithInputParser.add_params(self, mcp)
        dic, name = self.dic, self.dic['name']
        dic['scale'] = mcp.safe_get_float(name, 'scale')
        dic['scale'] /= dic['size'] if self.norm_type == self.CROSSMAP_RESPONSE_NORM else dic['size']**2
        dic['pow'] = mcp.safe_get_float(name, 'pow')
        dic['minDiv'] = mcp.safe_get_float(name, 'minDiv', default=1.0)

    def parse(self, name, mcp, prev_layers, model):
        dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
        dic['requiresParams'] = True
        dic['channels'] = mcp.safe_get_int(name, 'channels')
        dic['size'] = mcp.safe_get_int(name, 'size')
        dic['blocked'] = mcp.safe_get_bool(name, 'blocked', default=False)
        dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
        dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
        # Contrast normalization layer does not use its inputs
        dic['usesInputs'] = self.norm_type != self.CONTRAST_NORM
        self.verify_num_range(dic['channels'], 'channels', 1, None)
        if self.norm_type == self.CROSSMAP_RESPONSE_NORM:
            self.verify_num_range(dic['size'], 'size', 2, dic['channels'])
            if dic['channels'] % 16 != 0:
                raise LayerParsingError("Layer '%s': number of channels must be divisible by 16 when using crossMap" % name)
        else:
            self.verify_num_range(dic['size'], 'size', 1, dic['imgSize'])
        if self.norm_type != self.CROSSMAP_RESPONSE_NORM and dic['channels'] > 3 and dic['channels'] % 4 != 0:
            raise LayerParsingError("Layer '%s': number of channels must be smaller than 4 or divisible by 4" % name)
        self.verify_img_size()
        dic['outputs'] = dic['imgPixels'] * dic['channels']
        print "Initialized %s-normalization layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (self.norm_type, name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['channels'])
        return dic

class CostParser(LayerWithInputParser):
    def __init__(self, num_inputs=-1):
        LayerWithInputParser.__init__(self, num_inputs=num_inputs)

    def parse(self, name, mcp, prev_layers, model):
        dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
        dic['requiresParams'] = True
        # Stored as string because python can't pickle lambda functions
        dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs]'
        dic['children'] = mcp.safe_get_list(name, 'children', default=[])
        # Aggregated costs only produce outputs which are additive.
        for c in dic['children']:
            if c not in prev_layers:
                raise LayerParsingError("Layer '%s': child cost layer '%s' not defined" % (name, c))
            if prev_layers[c]['type'] != dic['type']:
                raise LayerParsingError("Layer '%s': child cost layer '%s' must have same type as parent" % (name, c))
            prev_layers[c]['aggregated'] = 1
        dic['aggregated'] = dic['children'] != []
        del dic['neuron']
        return dic

    def add_params(self, mcp):
        LayerWithInputParser.add_params(self, mcp)
        dic, name = self.dic, self.dic['name']
        dic['coeff'] = mcp.safe_get_float(name, 'coeff')
        dic['gradConsumer'] = dic['coeff'] > 0

class CrossEntCostParser(CostParser):
    def __init__(self):
        CostParser.__init__(self, num_inputs=2)

    def parse(self, name, mcp, prev_layers, model):
        dic = CostParser.parse(self, name, mcp, prev_layers, model)
        if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels
            raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name)
        if dic['inputLayers'][1]['type'] != 'softmax':
            raise LayerParsingError("Layer '%s': Second input must be softmax layer" % name)
        if dic['numInputs'][1] != model.train_data_provider.get_num_classes():
            raise LayerParsingError("Layer '%s': Softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \
                                    % (name, dic['inputs'][1], model.train_data_provider.get_num_classes()))
        print "Initialized cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus'])
        return dic

class LogregCostParser(CostParser):
    def __init__(self):
        CostParser.__init__(self, num_inputs=2)

    def add_params(self, mcp):
        CostParser.add_params(self, mcp)
        dic, name = self.dic, self.dic['name']
        dic['topk'] = mcp.safe_get_int(name, 'topk', default=1)
        if dic['topk'] > dic['numInputs'][1]:
            raise LayerParsingError("Layer '%s': parameter 'topk' must not have a value greater than the number of classes."
% (name)) def parse(self, name, mcp, prev_layers, model): dic = CostParser.parse(self, name, mcp, prev_layers, model) dic['requiresParams'] = True if dic['numInputs'][0] != 1: # first input must be labels raise LayerParsingError("Layer '%s': dimensionality of first input must be 1" % name) if dic['inputLayers'][1]['type'] != 'softmax': raise LayerParsingError("Layer '%s': second input must be softmax layer" % name) if dic['numInputs'][1] != model.train_data_provider.get_num_classes(): raise LayerParsingError("Layer '%s': softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \ % (name, dic['inputs'][1], model.train_data_provider.get_num_classes())) print "Initialized logistic regression cost '%s' on GPUs %s" % (name, dic['gpus']) return dic class BinomialCrossEntCostParser(CostParser): def __init__(self): CostParser.__init__(self, num_inputs=2) def add_params(self, mcp): CostParser.add_params(self, mcp) self.dic['posWeight'] = mcp.safe_get_float(self.dic['name'], 'posWeight', default=1.0) def parse(self, name, mcp, prev_layers, model): dic = CostParser.parse(self, name, mcp, prev_layers, model) if dic['numInputs'][0] != dic['numInputs'][1]: raise LayerParsingError("Layer '%s': both inputs must produce the same number of outputs" % (name)) if 'neuron' not in dic['inputLayers'][1] or dic['inputLayers'][1]['neuron'] != 'logistic': print "WARNING: Layer '%s': input '%s' is not logistic, results may not be what you intend." % (dic['name'], dic['inputs'][1]) if dic['type'] == 'cost.bce': print "Initialized binomial cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) dic['computeSoftmaxErrorRate'] = True return dic class DetectionCrossEntCostParser(BinomialCrossEntCostParser): def __init__(self): BinomialCrossEntCostParser.__init__(self) def parse(self, name, mcp, prev_layers, model): dic = BinomialCrossEntCostParser.parse(self, name, mcp, prev_layers, model) if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name) dic['computeSoftmaxErrorRate'] = False dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs[:2]] + [(class_cost[2] / class_cost[j] if class_cost[j] > 0 else n.inf) for class_cost in [costs[2:][i*3:(i+1)*3] for i in range(len(costs[2:])/3)] for j in range(2)]' dic['outputFilterFormatter'] = 'lambda self,costs: "(crossent) %.6f, (err) %.6f, " % (costs[0], costs[1]) + ", ".join("(%s) %.6f, %.6f" % (self.train_data_provider.batch_meta["label_names"][i/2-1],costs[i],costs[i+1]) for i in xrange(2, len(costs), 2))' print "Initialized detection cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus']) return dic class SumOfSquaresCostParser(CostParser): def __init__(self): CostParser.__init__(self, num_inputs=1) def parse(self, name, mcp, prev_layers, model): dic = CostParser.parse(self, name, mcp, prev_layers, model) print "Initialized sum-of-squares cost '%s' on GPUs %s" % (name, dic['gpus']) return dic # All the layer parsers layer_parsers = {'data' : lambda : DataLayerParser(), 'fc': lambda : FCLayerParser(), 'sfc': lambda : SplitFCLayerParser(), 'conv': lambda : ConvLayerParser(), 'local': lambda : LocalUnsharedLayerParser(), 'softmax': lambda : SoftmaxLayerParser(), 'eltsum': lambda : EltwiseSumLayerParser(), 'eltmax': lambda : EltwiseMaxLayerParser(), 'sum': lambda : SumLayerParser(), 'neuron': lambda : NeuronLayerParser(), 'pool': lambda : PoolLayerParser(), 
'cmpool': lambda : CrossMapPoolLayerParser(), 'rnorm': lambda : NormLayerParser(NormLayerParser.RESPONSE_NORM), 'cnorm': lambda : NormLayerParser(NormLayerParser.CONTRAST_NORM), 'cmrnorm': lambda : NormLayerParser(NormLayerParser.CROSSMAP_RESPONSE_NORM), 'nailbed': lambda : NailbedLayerParser(), 'blur': lambda : GaussianBlurLayerParser(), 'href': lambda : HorizontalReflectionLayerParser(), 'resize': lambda : ResizeLayerParser(), 'rgb2yuv': lambda : RGBToYUVLayerParser(), 'rgb2lab': lambda : RGBToLABLayerParser(), 'rscale': lambda : RandomScaleLayerParser(), 'crop': lambda : CropLayerParser(), 'concat': lambda : ConcatentionLayerParser(), 'pass': lambda : PassThroughLayerParser(), 'dropout': lambda : DropoutLayerParser(), 'dropout2': lambda : Dropout2LayerParser(), 'cost.logreg': lambda : LogregCostParser(), 'cost.crossent': lambda : CrossEntCostParser(), 'cost.bce': lambda : BinomialCrossEntCostParser(), 'cost.dce': lambda : DetectionCrossEntCostParser(), 'cost.sum2': lambda : SumOfSquaresCostParser()} # All the neuron parsers # This isn't a name --> parser mapping as the layer parsers above because neurons don't have fixed names. # A user may write tanh[0.5,0.25], etc. neuron_parsers = sorted([NeuronParser('ident', 'f(x) = x', uses_acts=False, uses_inputs=False), NeuronParser('logistic', 'f(x) = 1 / (1 + e^-x)', uses_acts=True, uses_inputs=False), NeuronParser('abs', 'f(x) = |x|', uses_acts=False, uses_inputs=True), NeuronParser('relu', 'f(x) = max(0, x)', uses_acts=True, uses_inputs=False), NeuronParser('nrelu', 'f(x) = max(0, x) + noise', uses_acts=True, uses_inputs=False), NeuronParser('softrelu', 'f(x) = log(1 + e^x)', uses_acts=True, uses_inputs=False), NeuronParser('square', 'f(x) = x^2', uses_acts=False, uses_inputs=True), NeuronParser('sqrt', 'f(x) = sqrt(x)', uses_acts=True, uses_inputs=False), ParamNeuronParser('log[a]', 'f(x) = log(a + x)', uses_acts=False, uses_inputs=True), ParamNeuronParser('tanh[a,b]', 'f(x) = a * tanh(b * x)', uses_acts=True, uses_inputs=False), ParamNeuronParser('brelu[a]', 'f(x) = min(a, max(0, x))', uses_acts=True, uses_inputs=False), ParamNeuronParser('linear[a,b]', 'f(x) = a * x + b', uses_acts=True, uses_inputs=False), ParamNeuronParser('drelu[a]', 'f(x) = x - a * tanh(x / a)', uses_acts=False, uses_inputs=True)], key=lambda x:x.type) # Learning rate schedules lrs_parsers = sorted([ParamParser('const[fbase]'), ParamParser('linear[fbase;ftgtFactor]'), ParamParser('exp[fbase;ftgtFactor]'), ParamParser('dexp[fbase;ftgtFactor;inumSteps]')]) ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-cifar10-11pct.cfg ================================================ # 11% error on CIFAR-10 - layer parameter file # Methodology: # 1. Train on batches 1-4, use batch 5 for validation. # 2. After about 350 epochs, validation error no longer making improvements. # 3. Fold in batch 5. # 4. Train on batches 1-5 for about 150 more epochs, until the batch 5 error is near the errors for batches 1-4. It takes forever to actually get there but after 150 epochs it's close enough. # 5. Lower learning rates (epsW) by a factor of 10 to 0.0001, train for 10 more epochs. # 6. Lower learning rates (epsW) by another factor of 10 to 0.00001, train for 10 more epochs. # 7. Stop. 
Test on batch 6 with --test-range=6 --multiview-test=1 --logreg-name=logprob (read more about what this does here: http://code.google.com/p/cuda-convnet/wiki/TrainingNet#Training_on_image_translations ) # More details about methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology [conv1] epsW=0.001 epsB=0.002 momW=0.9 momB=0.9 wc=0.000 [conv2] epsW=0.001 epsB=0.002 momW=0.9 momB=0.9 wc=0.000 [local3] epsW=0.001 epsB=0.002 momW=0.9 momB=0.9 wc=0.004 [local4] epsW=0.001 epsB=0.002 momW=0.9 momB=0.9 wc=0.004 [fc10] epsW=0.001 epsB=0.002 momW=0.9 momB=0.9 wc=0.01 [logprob] coeff=1 [rnorm1] scale=0.001 pow=0.75 [rnorm2] scale=0.001 pow=0.75 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-1gpu.cfg ================================================ [conv1] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [conv2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [conv3] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [conv4] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [conv5] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096a] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096b] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1000] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [logprob] coeff=1 topk=5 [dropout1] enable=true [dropout2] enable=true [rnorm1] scale=0.0001 pow=0.75 minDiv=2 [rnorm2] scale=0.0001 pow=0.75 minDiv=2 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-data.cfg ================================================ [conv1] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [conv2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [conv3] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [conv4] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [conv5] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096a] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096b] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1000] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] 
epsB=dexp[base=0.04;tgtFactor=10;numSteps=2] updatePeriod=1 [logprob] coeff=1 topk=5 [dropout1] enable=true [dropout2] enable=true [rnorm1] scale=0.0001 pow=0.75 minDiv=2 [rnorm2] scale=0.0001 pow=0.75 minDiv=2 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-2gpu-model.cfg ================================================ [conv1a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 [conv1b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 [conv2a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 [conv2b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 [conv3a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [conv3b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [conv4a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0 [conv4b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0 [conv5a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0 [conv5b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9 momB=0.9 wc=0.0005 wball=0 [fc2048a] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [fc2048b] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [fc2048ba] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [fc2048bb] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [fc1000] epsW=dexp[base=0.02;tgtFactor=250;numSteps=4] epsB=dexp[base=0.04;tgtFactor=25;numSteps=2] momW=0.9,0.9 momB=0.9 wc=0.0005,0.0005 wball=0,0 [logprob] coeff=1 topk=5 [dropout1a] enable=true keep=0.5 [dropout2a] enable=true keep=0.5 [dropout1b] enable=true keep=0.5 [dropout2b] enable=true keep=0.5 [rnorm1a] scale=0.0001 pow=0.75 minDiv=2 [rnorm1b] scale=0.0001 pow=0.75 minDiv=2 [rnorm2a] scale=0.0001 pow=0.75 minDiv=2 [rnorm2b] scale=0.0001 pow=0.75 minDiv=2 [cnorm2a] scale=0.001 pow=0.75 [cnorm2b] scale=0.001 pow=0.75 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data-model.cfg ================================================ [conv1] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] [conv2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] [conv3] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] [conv4] momW=0.9 momB=0.9 wc=0.0005 
wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] [conv5] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] [fc1024a] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024b] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024c] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024d] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024ba] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024bb] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024bc] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1024bd] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1000] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.01;tgtFactor=250;numSteps=4] epsB=dexp[base=0.02;tgtFactor=10;numSteps=2] updatePeriod=1 [logprob] coeff=1 topk=5 [dropout1a] enable=true keep=0.5 [dropout1b] enable=true keep=0.5 [dropout1c] enable=true keep=0.5 [dropout1d] enable=true keep=0.5 [dropout2a] enable=true keep=0.5 [dropout2b] enable=true keep=0.5 [dropout2c] enable=true keep=0.5 [dropout2d] enable=true keep=0.5 [rnorm1] scale=0.0001 pow=0.75 minDiv=2 [rnorm2] scale=0.0001 pow=0.75 minDiv=2 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layer-params-imagenet-4gpu-data.cfg ================================================ [conv1] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [conv2] momW=0.9 momB=0.9 wc=0.0005 wball=0.00 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [conv3] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [conv4] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [conv5] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096a] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [fc4096b] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [fc1000] momW=0.9 momB=0.9 wc=0.0005 wball=0 epsW=dexp[base=0.04;tgtFactor=250;numSteps=4] epsB=dexp[base=0.08;tgtFactor=10;numSteps=2] updatePeriod=1 [logprob] coeff=1 topk=5 [dropout1] enable=true [dropout2] enable=true [rnorm1] scale=0.0001 pow=0.75 minDiv=2 [rnorm2] scale=0.0001 pow=0.75 minDiv=2 
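The epsW/epsB values in these parameter files are the learning-rate schedule strings handled by lrs_parsers in layer.py above (const, linear, exp, dexp). Below is a minimal sketch of how such a string can be decoded, assuming dexp denotes a discrete exponential schedule that lowers the rate from base toward base/tgtFactor in numSteps multiplicative steps as training progresses; parse_schedule and rate are hypothetical helpers, and the authoritative semantics live in cudaconvnet/include/lr.cuh and src/lr.cu.

import re

def parse_schedule(s):
    # 'dexp[base=0.04;tgtFactor=250;numSteps=4]' -> ('dexp', {'base': 0.04, ...})
    m = re.match(r'^(\w+)\[(.*)\]$', s)
    kind = m.group(1)
    params = dict(kv.split('=') for kv in m.group(2).split(';'))
    return kind, dict((k, float(v)) for k, v in params.items())

def rate(kind, p, progress):
    # progress is the fraction of training completed, in [0, 1]
    if kind == 'const':
        return p['base']
    if kind == 'dexp': # assumed: discrete exponential decay to base/tgtFactor (numSteps >= 2)
        step = min(int(p['numSteps'] * progress), int(p['numSteps']) - 1)
        return p['base'] * p['tgtFactor'] ** (-step / (p['numSteps'] - 1))
    raise ValueError("unhandled schedule type: %s" % kind)

kind, p = parse_schedule('dexp[base=0.04;tgtFactor=250;numSteps=4]')
for progress in (0.0, 0.3, 0.6, 0.9):
    print "progress=%.1f eps=%g" % (progress, rate(kind, p, progress))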
================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-cifar10-11pct.cfg ================================================ [data] type=data dataIdx=0 [labels] type=data dataIdx=1 [conv1] type=conv inputs=data channels=3 filters=64 padding=2 stride=1 filterSize=5 neuron=relu initW=0.0001 sumWidth=4 sharedBiases=1 gpu=0 [pool1] type=pool pool=max inputs=conv1 start=0 sizeX=3 stride=2 outputsX=0 channels=64 [rnorm1] type=cmrnorm inputs=pool1 channels=64 size=9 [conv2] type=conv inputs=rnorm1 filters=64 padding=2 stride=1 filterSize=5 channels=64 neuron=relu initW=0.01 sumWidth=2 sharedBiases=1 [rnorm2] type=cmrnorm inputs=conv2 channels=64 size=9 [pool2] type=pool pool=max inputs=rnorm2 start=0 sizeX=3 stride=2 outputsX=0 channels=64 [local3] type=local inputs=pool2 filters=64 padding=1 stride=1 filterSize=3 channels=64 neuron=relu initW=0.04 [local4] type=local inputs=local3 filters=32 padding=1 stride=1 filterSize=3 channels=64 neuron=relu initW=0.04 [fc10] type=fc outputs=10 inputs=local4 initW=0.01 [probs] type=softmax inputs=fc10 [logprob] type=cost.logreg inputs=labels,probs gpu=0 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-imagenet-1gpu.cfg ================================================ [data] type=data dataIdx=0 [labvec] type=data dataIdx=1 [conv1] type=conv inputs=data channels=3 filters=64 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=4 sharedBiases=1 gpu=0 [rnorm1] type=cmrnorm inputs=conv1 channels=64 size=5 [pool1] type=pool pool=max inputs=rnorm1 sizeX=3 stride=2 channels=64 neuron=relu [conv2] type=conv inputs=pool1 filters=192 padding=2 stride=1 filterSize=5 channels=64 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu [rnorm2] type=cmrnorm inputs=conv2 channels=192 size=5 [pool2] type=pool pool=max inputs=rnorm2 sizeX=3 stride=2 channels=192 [conv3] type=conv inputs=pool2 filters=384 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 sumWidth=3 sharedBiases=1 neuron=relu [conv4] type=conv inputs=conv3 filters=256 padding=1 stride=1 filterSize=3 channels=384 neuron=relu initW=0.03 initB=1 sumWidth=3 sharedBiases=1 [conv5] type=conv inputs=conv4 filters=256 padding=1 stride=1 filterSize=3 channels=256 initW=0.03 initB=1 sumWidth=3 [pool3] type=pool pool=max inputs=conv5 sizeX=3 stride=2 channels=256 neuron=relu [fc4096a] type=fc inputs=pool3 outputs=4096 initW=0.01 initB=1 neuron=relu gpu=0 [dropout1] type=dropout2 inputs=fc4096a [fc4096b] type=fc inputs=dropout1 outputs=4096 initW=0.01 initB=1 neuron=relu gpu=0 [dropout2] type=dropout2 inputs=fc4096b [fc1000] type=fc outputs=1000 inputs=dropout2 initW=0.01 initB=-7 gpu=0 [probs] type=softmax inputs=fc1000 [logprob] type=cost.logreg inputs=labvec,probs gpu=0 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-data.cfg ================================================ [data] type=data dataIdx=0 [labvec] type=data dataIdx=1 [conv1] type=conv inputs=data channels=3 filters=64 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=4 sharedBiases=1 gpu=0,1 [rnorm1] type=cmrnorm inputs=conv1 channels=64 size=5 [pool1] type=pool pool=max inputs=rnorm1 sizeX=3 stride=2 channels=64 neuron=relu [conv2] type=conv inputs=pool1 filters=192 padding=2 stride=1 filterSize=5 channels=64 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu [rnorm2] type=cmrnorm inputs=conv2 channels=192 size=5 [pool2] type=pool pool=max inputs=rnorm2 sizeX=3 stride=2 channels=192 [conv3] 
type=conv inputs=pool2 filters=384 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 sumWidth=3 sharedBiases=1 neuron=relu [conv4] type=conv inputs=conv3 filters=256 padding=1 stride=1 filterSize=3 channels=384 neuron=relu initW=0.03 initB=1 sumWidth=3 sharedBiases=1 [conv5] type=conv inputs=conv4 filters=256 padding=1 stride=1 filterSize=3 channels=256 initW=0.03 initB=1 sumWidth=3 [pool3] type=pool pool=max inputs=conv5 sizeX=3 stride=2 channels=256 neuron=relu [fc4096a] type=fc inputs=pool3 outputs=4096 initW=0.01 initB=1 neuron=relu [dropout1] type=dropout2 inputs=fc4096a [fc4096b] type=fc inputs=dropout1 outputs=4096 initW=0.01 initB=1 neuron=relu [dropout2] type=dropout2 inputs=fc4096b [fc1000] type=fc outputs=1000 inputs=dropout2 initW=0.01 initB=-7 [probs] type=softmax inputs=fc1000 [logprob] type=cost.logreg inputs=labvec,probs gpu=0,1 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-imagenet-2gpu-model.cfg ================================================ [data] type=data dataIdx=0 [labels] type=data dataIdx=1 [conv1a] type=conv inputs=data channels=3 filters=48 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=3 sharedBiases=1 gpu=0 [conv1b] type=conv inputs=data channels=3 filters=48 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=3 sharedBiases=1 gpu=1 [rnorm1a] type=cmrnorm inputs=conv1a channels=48 size=5 [rnorm1b] type=cmrnorm inputs=conv1b channels=48 size=5 [pool1a] type=pool pool=max inputs=rnorm1a sizeX=3 stride=2 channels=48 neuron=relu [pool1b] type=pool pool=max inputs=rnorm1b sizeX=3 stride=2 channels=48 neuron=relu [conv2a] type=conv inputs=pool1a filters=128 padding=2 stride=1 filterSize=5 channels=48 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu gpu=0 [conv2b] type=conv inputs=pool1b filters=128 padding=2 stride=1 filterSize=5 channels=48 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu gpu=1 [rnorm2a] type=cmrnorm inputs=conv2a channels=128 size=5 [rnorm2b] type=cmrnorm inputs=conv2b channels=128 size=5 [cnorm2a] type=rnorm inputs=rnorm2a channels=128 size=5 [cnorm2b] type=rnorm inputs=rnorm2b channels=128 size=5 [pool2a] type=pool pool=max inputs=cnorm2a sizeX=3 stride=2 channels=128 [pool2b] type=pool pool=max inputs=cnorm2b sizeX=3 stride=2 channels=128 [conv3a] type=conv inputs=pool2a,pool2b filters=192,192 padding=1,1 stride=1,1 filterSize=3,3 channels=128,128 initW=0.03,0.03 sumWidth=2 sharedBiases=1 neuron=relu gpu=0 [conv3b] type=conv inputs=pool2a,pool2b filters=192,192 padding=1,1 stride=1,1 filterSize=3,3 channels=128,128 initW=0.03,0.03 sumWidth=2 sharedBiases=1 neuron=relu gpu=1 [conv4a] type=conv inputs=conv3a filters=192 padding=1 stride=1 filterSize=3 channels=192 neuron=relu initW=0.03 initB=1 sumWidth=2 sharedBiases=1 [conv4b] type=conv inputs=conv3b filters=192 padding=1 stride=1 filterSize=3 channels=192 neuron=relu initW=0.03 initB=1 sumWidth=2 sharedBiases=1 [conv5a] type=conv inputs=conv4a filters=128 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 initB=1 sumWidth=2 groups=1 randSparse=0 [conv5b] type=conv inputs=conv4b filters=128 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 initB=1 sumWidth=2 groups=1 randSparse=0 [pool3a] type=pool pool=max inputs=conv5a sizeX=3 stride=2 channels=128 neuron=relu [pool3b] type=pool pool=max inputs=conv5b sizeX=3 stride=2 channels=128 neuron=relu [fc2048a] type=fc inputs=pool3a,pool3b outputs=2048 initW=0.01,0.01 initB=1 neuron=relu gpu=0 [fc2048b] type=fc inputs=pool3a,pool3b outputs=2048 initW=0.01,0.01 
initB=1 neuron=relu gpu=1 [dropout1a] type=dropout inputs=fc2048a [dropout1b] type=dropout inputs=fc2048b [fc2048ba] type=fc inputs=dropout1a,dropout1b outputs=2048 initW=0.01,0.01 initB=1 neuron=relu gpu=0 [fc2048bb] type=fc inputs=dropout1b,dropout1a outputs=2048 initW=0.01,0.01 initB=1 neuron=relu gpu=1 [dropout2a] type=dropout inputs=fc2048ba [dropout2b] type=dropout inputs=fc2048bb [fc1000] type=fc outputs=1000 inputs=dropout2a,dropout2b initW=0.01,0.01 gpu=0 [probs] type=softmax inputs=fc1000 [logprob] type=cost.logreg inputs=labels,probs gpu=0 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data-model.cfg ================================================ [data] type=data dataIdx=0 [labvec] type=data dataIdx=1 [conv1] type=conv inputs=data channels=3 filters=64 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=4 sharedBiases=1 gpu=0,1,2,3 [rnorm1] type=cmrnorm inputs=conv1 channels=64 size=5 [pool1] type=pool pool=max inputs=rnorm1 sizeX=3 stride=2 channels=64 neuron=relu [conv2] type=conv inputs=pool1 filters=192 padding=2 stride=1 filterSize=5 channels=64 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu [rnorm2] type=cmrnorm inputs=conv2 channels=192 size=5 [pool2] type=pool pool=max inputs=rnorm2 sizeX=3 stride=2 channels=192 [conv3] type=conv inputs=pool2 filters=384 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 sumWidth=3 sharedBiases=1 neuron=relu [conv4] type=conv inputs=conv3 filters=256 padding=1 stride=1 filterSize=3 channels=384 neuron=relu initW=0.03 initB=1 sumWidth=3 sharedBiases=1 [conv5] type=conv inputs=conv4 filters=256 padding=1 stride=1 filterSize=3 channels=256 initW=0.03 initB=1 sumWidth=3 [pool3] type=pool pool=max inputs=conv5 sizeX=3 stride=2 channels=256 neuron=relu [fc1024a] type=fc inputs=pool3 outputs=1024 initW=0.01 initB=1 neuron=relu gpu=0 [fc1024b] type=fc inputs=pool3 outputs=1024 initW=0.01 initB=1 neuron=relu gpu=1 [fc1024c] type=fc inputs=pool3 outputs=1024 initW=0.01 initB=1 neuron=relu gpu=2 [fc1024d] type=fc inputs=pool3 outputs=1024 initW=0.01 initB=1 neuron=relu gpu=3 [dropout1a] type=dropout2 inputs=fc1024a [dropout1b] type=dropout2 inputs=fc1024b [dropout1c] type=dropout2 inputs=fc1024c [dropout1d] type=dropout2 inputs=fc1024d # This is like a concatenation layer [pass1a] type=pass inputs=dropout1a,dropout1b,dropout1c,dropout1d gpu=0 # This is like a concatenation layer [pass1b] type=pass inputs=dropout1a,dropout1b,dropout1c,dropout1d gpu=1 # This is like a concatenation layer [pass1c] type=pass inputs=dropout1a,dropout1b,dropout1c,dropout1d gpu=2 # This is like a concatenation layer [pass1d] type=pass inputs=dropout1a,dropout1b,dropout1c,dropout1d gpu=3 [fc1024ba] type=fc inputs=pass1a outputs=1024 initW=0.01 initB=1 neuron=relu [fc1024bb] type=fc inputs=pass1b outputs=1024 initW=0.01 initB=1 neuron=relu [fc1024bc] type=fc inputs=pass1c outputs=1024 initW=0.01 initB=1 neuron=relu [fc1024bd] type=fc inputs=pass1d outputs=1024 initW=0.01 initB=1 neuron=relu [dropout2a] type=dropout2 inputs=fc1024ba [dropout2b] type=dropout2 inputs=fc1024bb [dropout2c] type=dropout2 inputs=fc1024bc [dropout2d] type=dropout2 inputs=fc1024bd [pass2a] inputs=dropout2a,dropout2b,dropout2c,dropout2d type=pass gpu=0 [fc1000] type=fc outputs=1000 inputs=pass2a initW=0.01 [probs] type=softmax inputs=fc1000 [logprob] type=cost.logreg inputs=labvec,probs gpu=0 ================================================ FILE: caffe2/contrib/cuda-convnet2/layers/layers-imagenet-4gpu-data.cfg 
================================================ [data] type=data dataIdx=0 [labvec] type=data dataIdx=1 [conv1] type=conv inputs=data channels=3 filters=64 padding=0 stride=4 filterSize=11 initW=0.01 sumWidth=4 sharedBiases=1 gpu=0,1,2,3 [rnorm1] type=cmrnorm inputs=conv1 channels=64 size=5 [pool1] type=pool pool=max inputs=rnorm1 sizeX=3 stride=2 channels=64 neuron=relu [conv2] type=conv inputs=pool1 filters=192 padding=2 stride=1 filterSize=5 channels=64 initW=0.01 initB=1 sumWidth=3 sharedBiases=1 neuron=relu [rnorm2] type=cmrnorm inputs=conv2 channels=192 size=5 [pool2] type=pool pool=max inputs=rnorm2 sizeX=3 stride=2 channels=192 [conv3] type=conv inputs=pool2 filters=384 padding=1 stride=1 filterSize=3 channels=192 initW=0.03 sumWidth=3 sharedBiases=1 neuron=relu [conv4] type=conv inputs=conv3 filters=256 padding=1 stride=1 filterSize=3 channels=384 neuron=relu initW=0.03 initB=1 sumWidth=3 sharedBiases=1 [conv5] type=conv inputs=conv4 filters=256 padding=1 stride=1 filterSize=3 channels=256 initW=0.03 initB=1 sumWidth=3 [pool3] type=pool pool=max inputs=conv5 sizeX=3 stride=2 channels=256 neuron=relu [fc4096a] type=fc inputs=pool3 outputs=4096 initW=0.01 initB=1 neuron=relu [dropout1] type=dropout2 inputs=fc4096a [fc4096b] type=fc inputs=dropout1 outputs=4096 initW=0.01 initB=1 neuron=relu [dropout2] type=dropout2 inputs=fc4096b [fc1000] type=fc outputs=1000 inputs=dropout2 initW=0.01 initB=-7 [probs] type=softmax inputs=fc1000 [logprob] type=cost.logreg inputs=labvec,probs gpu=0,1,2,3 ================================================ FILE: caffe2/contrib/cuda-convnet2/make-data/make-data.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################# # This script makes batches suitable for training from raw ILSVRC 2012 tar files. import tarfile from StringIO import StringIO from random import shuffle import sys from time import time from pyext._MakeDataPyExt import resizeJPEG import itertools import os import cPickle import scipy.io import math import argparse as argp # Set this to True to crop images to square. In this case each image will be # resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels, and then the # center OUTPUT_IMAGE_SIZE x OUTPUT_IMAGE_SIZE patch will be extracted. # # Set this to False to preserve image borders. In this case each image will be # resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels. This was # demonstrated to be superior by Andrew Howard in his very nice paper: # http://arxiv.org/abs/1312.5402 CROP_TO_SQUARE = True OUTPUT_IMAGE_SIZE = 256 # Number of threads to use for JPEG decompression and image resizing. NUM_WORKER_THREADS = 8 # Don't worry about these. 
OUTPUT_BATCH_SIZE = 3072 OUTPUT_SUB_BATCH_SIZE = 1024 def pickle(filename, data): with open(filename, "w") as fo: cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) def unpickle(filename): fo = open(filename, 'r') contents = cPickle.load(fo) fo.close() return contents def partition_list(l, partition_size): divup = lambda a,b: (a + b - 1) / b return [l[i*partition_size:(i+1)*partition_size] for i in xrange(divup(len(l),partition_size))] def open_tar(path, name): if not os.path.exists(path): print "ILSVRC 2012 %s not found at %s. Make sure to set ILSVRC_SRC_DIR correctly at the top of this file (%s)." % (name, path, sys.argv[0]) sys.exit(1) return tarfile.open(path) def makedir(path): if not os.path.exists(path): os.makedirs(path) def parse_devkit_meta(ILSVRC_DEVKIT_TAR): tf = open_tar(ILSVRC_DEVKIT_TAR, 'devkit tar') fmeta = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/meta.mat')) meta_mat = scipy.io.loadmat(StringIO(fmeta.read())) labels_dic = dict((m[0][1][0], m[0][0][0][0]-1) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) label_names_dic = dict((m[0][1][0], m[0][2][0]) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000) label_names = [tup[1] for tup in sorted([(v,label_names_dic[k]) for k,v in labels_dic.items()], key=lambda x:x[0])] fval_ground_truth = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt')) validation_ground_truth = [[int(line.strip()) - 1] for line in fval_ground_truth.readlines()] tf.close() return labels_dic, label_names, validation_ground_truth def write_batches(target_dir, name, start_batch_num, labels, jpeg_files): jpeg_files = partition_list(jpeg_files, OUTPUT_BATCH_SIZE) labels = partition_list(labels, OUTPUT_BATCH_SIZE) makedir(target_dir) print "Writing %s batches..." 
% name for i,(labels_batch, jpeg_file_batch) in enumerate(zip(labels, jpeg_files)): t = time() jpeg_strings = list(itertools.chain.from_iterable(resizeJPEG([jpeg.read() for jpeg in jpeg_file_batch], OUTPUT_IMAGE_SIZE, NUM_WORKER_THREADS, CROP_TO_SQUARE))) batch_path = os.path.join(target_dir, 'data_batch_%d' % (start_batch_num + i)) makedir(batch_path) for j in xrange(0, len(labels_batch), OUTPUT_SUB_BATCH_SIZE): pickle(os.path.join(batch_path, 'data_batch_%d.%d' % (start_batch_num + i, j/OUTPUT_SUB_BATCH_SIZE)), {'data': jpeg_strings[j:j+OUTPUT_SUB_BATCH_SIZE], 'labels': labels_batch[j:j+OUTPUT_SUB_BATCH_SIZE]}) print "Wrote %s (%s batch %d of %d) (%.2f sec)" % (batch_path, name, i+1, len(jpeg_files), time() - t) return i + 1 if __name__ == "__main__": parser = argp.ArgumentParser() parser.add_argument('--src-dir', help='Directory containing ILSVRC2012_img_train.tar, ILSVRC2012_img_val.tar, and ILSVRC2012_devkit_t12.tar.gz', required=True) parser.add_argument('--tgt-dir', help='Directory to output ILSVRC 2012 batches suitable for cuda-convnet to train on.', required=True) args = parser.parse_args() print "CROP_TO_SQUARE: %s" % CROP_TO_SQUARE print "OUTPUT_IMAGE_SIZE: %s" % OUTPUT_IMAGE_SIZE print "NUM_WORKER_THREADS: %s" % NUM_WORKER_THREADS ILSVRC_TRAIN_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_train.tar') ILSVRC_VALIDATION_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_val.tar') ILSVRC_DEVKIT_TAR = os.path.join(args.src_dir, 'ILSVRC2012_devkit_t12.tar.gz') assert OUTPUT_BATCH_SIZE % OUTPUT_SUB_BATCH_SIZE == 0 labels_dic, label_names, validation_labels = parse_devkit_meta(ILSVRC_DEVKIT_TAR) with open_tar(ILSVRC_TRAIN_TAR, 'training tar') as tf: synsets = tf.getmembers() synset_tars = [tarfile.open(fileobj=tf.extractfile(s)) for s in synsets] print "Loaded synset tars." print "Building training set image list (this can take 10-20 minutes)..." sys.stdout.flush() train_jpeg_files = [] for i,st in enumerate(synset_tars): if i % 100 == 0: print "%d%% ..." % int(round(100.0 * float(i) / len(synset_tars))), sys.stdout.flush() train_jpeg_files += [st.extractfile(m) for m in st.getmembers()] st.close() shuffle(train_jpeg_files) train_labels = [[labels_dic[jpeg.name[:9]]] for jpeg in train_jpeg_files] print "done" # Write training batches i = write_batches(args.tgt_dir, 'training', 0, train_labels, train_jpeg_files) # Write validation batches val_batch_start = int(math.ceil((i / 1000.0))) * 1000 with open_tar(ILSVRC_VALIDATION_TAR, 'validation tar') as tf: validation_jpeg_files = sorted([tf.extractfile(m) for m in tf.getmembers()], key=lambda x:x.name) write_batches(args.tgt_dir, 'validation', val_batch_start, validation_labels, validation_jpeg_files) # Write meta file meta = unpickle('input_meta') meta_file = os.path.join(args.tgt_dir, 'batches.meta') meta.update({'batch_size': OUTPUT_BATCH_SIZE, 'num_vis': OUTPUT_IMAGE_SIZE**2 * 3, 'label_names': label_names}) pickle(meta_file, meta) print "Wrote %s" % meta_file print "All done! ILSVRC 2012 batches are in %s" % args.tgt_dir ================================================ FILE: caffe2/contrib/cuda-convnet2/make-data/pyext/Makefile ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. INCLUDES := -I./include COMMONFLAGS := CC_ARGS := ifndef debug CC_ARGS += -O3 endif CC=g++ OUT_DIR=./bin/$(OUT_SUFFIX) PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2) LINK_LIBS := -L$(CUDA_INSTALL_PATH)/lib64 `pkg-config --libs python` `pkg-config --libs opencv` -lpthread INCLUDES += -I$(PYTHON_INCLUDE_PATH) OUT_FILE=_MakeDataPyExt.so all: dir classes $(OUT_FILE) dir: mkdir -p $(OUT_DIR)/src SOURCES = $(shell echo src/*.cpp) CLASSES = $(SOURCES:.cpp=.o) classes: $(CLASSES) %.o: %.cpp $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o $(OUT_FILE): classes cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS) ln -sf $(OUT_DIR)/$(OUT_FILE) . clean: rm -rf $(OUT_DIR)/* ================================================ FILE: caffe2/contrib/cuda-convnet2/make-data/pyext/__init__.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: caffe2/contrib/cuda-convnet2/make-data/pyext/include/pyext.h ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

#ifndef INCLUDE_PYEXT_H_
#define INCLUDE_PYEXT_H_

#include <Python.h>
//#include <cv.h>
#include <opencv2/opencv.hpp>
#include "../../../util/include/thread.h"

#define JPEG_QUALITY 95

#ifndef DIVUP
#define DIVUP(a,b) (((a) + (b) - 1) / (b))
#endif

extern "C" {
    void init_MakeDataPyExt();
}

PyObject* resizeJPEG(PyObject *self, PyObject *args);

class DecoderThread : public Thread {
protected:
    PyObject* _py_list_src;
    PyObject* _py_list_tgt;
    int _start_img, _end_img;
    int _target_size;
    bool _crop_to_square;

    cv::Mat _resized_mat_buffer;
    std::vector<uchar> _output_jpeg_buffer;
    std::vector<int> _encode_params;

    void* run();
    void makeJPEG(int idx);

public:
    DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square);
    virtual ~DecoderThread();
    PyObject* getTargetList();
};

#endif // INCLUDE_PYEXT_H_


================================================
FILE: caffe2/contrib/cuda-convnet2/make-data/pyext/src/pyext.cpp
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../include/pyext.h"

using namespace std;

static PyMethodDef _MakeDataPyExtMethods[] = {{ "resizeJPEG", resizeJPEG, METH_VARARGS },
                                              { NULL, NULL }};

void init_MakeDataPyExt() {
    (void) Py_InitModule("_MakeDataPyExt", _MakeDataPyExtMethods);
}

PyObject* resizeJPEG(PyObject *self, PyObject *args) {
    PyListObject* pyListSrc;
    int tgtImgSize, numThreads;
    int cropToSquare;

    if (!PyArg_ParseTuple(args, "O!iii",
                          &PyList_Type, &pyListSrc,
                          &tgtImgSize,
                          &numThreads,
                          &cropToSquare)) {
        return NULL;
    }

    DecoderThread* threads[numThreads];
    int num_imgs = PyList_GET_SIZE(pyListSrc);
    int num_imgs_per_thread = DIVUP(num_imgs, numThreads);
    for (int t = 0; t < numThreads; ++t) {
        int start_img = t * num_imgs_per_thread;
        int end_img = min(num_imgs, (t + 1) * num_imgs_per_thread);
        threads[t] = new DecoderThread((PyObject*)pyListSrc, start_img, end_img, tgtImgSize, cropToSquare);
        threads[t]->start();
    }

    PyObject* pyListTgt = PyList_New(0);
    for (int t = 0; t < numThreads; ++t) {
        threads[t]->join();
        PyList_Append(pyListTgt, threads[t]->getTargetList());
        delete threads[t]; // the thread's list too
    }

    return pyListTgt;
}

DecoderThread::DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square)
: Thread(true), _py_list_src(py_list_src), _start_img(start_img), _end_img(end_img),
  _target_size(target_size), _crop_to_square(crop_to_square) {
    _encode_params.push_back(CV_IMWRITE_JPEG_QUALITY);
    _encode_params.push_back(JPEG_QUALITY);
    _py_list_tgt = PyList_New(0);
}

DecoderThread::~DecoderThread() {
    Py_DECREF(_py_list_tgt);
}

void* DecoderThread::run() {
    for (int i = _start_img; i < _end_img; ++i) {
        makeJPEG(i);
    }
    return NULL;
}

PyObject* DecoderThread::getTargetList() {
    return _py_list_tgt;
}

void DecoderThread::makeJPEG(int idx) {
    /*
     * Decompress JPEG
     */
    PyObject* pySrc = PyList_GET_ITEM(_py_list_src, idx);
    uchar* src = (unsigned char*)PyString_AsString(pySrc);
    size_t src_len = PyString_GET_SIZE(pySrc);
    vector<uchar> src_vec(src, src + src_len);
    cv::Mat decoded_mat = cv::imdecode(cv::Mat(src_vec), CV_LOAD_IMAGE_COLOR);
    assert(decoded_mat.channels() == 3);

    /*
     * Resize
     */
    double min_dim = std::min(decoded_mat.size().height, decoded_mat.size().width);
    double scale_factor = _target_size / min_dim;
    int new_height = round(scale_factor * decoded_mat.size().height);
    int new_width = round(scale_factor * decoded_mat.size().width);
    assert((new_height == _target_size && new_width >= _target_size)
           || (new_width == _target_size && new_height >= _target_size));
    int interpolation = scale_factor == 1 ? cv::INTER_LINEAR
                      : scale_factor > 1 ? cv::INTER_CUBIC : cv::INTER_AREA;
    cv::resize(decoded_mat, _resized_mat_buffer, cv::Size(new_width, new_height), 0, 0, interpolation);

    /*
     * Conditionally crop and compress JPEG
     */
    if (_crop_to_square) {
        int crop_start_x = (new_width - _target_size) / 2;
        int crop_start_y = (new_height - _target_size) / 2;
        cv::Rect cropRect(crop_start_x, crop_start_y, _target_size, _target_size);
        cv::Mat cropped_mat_buffer = _resized_mat_buffer(cropRect);
        cv::imencode(".jpg", cropped_mat_buffer, _output_jpeg_buffer, _encode_params);
    } else {
        cv::imencode(".jpg", _resized_mat_buffer, _output_jpeg_buffer, _encode_params);
    }

    char* output_jpeg_buffer_ptr = reinterpret_cast<char*>(&_output_jpeg_buffer[0]);
    PyObject* pyStr = PyString_FromStringAndSize(output_jpeg_buffer_ptr, _output_jpeg_buffer.size());
    PyList_Append(_py_list_tgt, pyStr);
    Py_DECREF(pyStr);
}
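The geometry in makeJPEG above (scale so the shortest edge equals the target size, then optionally cut out a centered square) is easy to sanity-check in isolation. A minimal standalone sketch of just that arithmetic, assuming nothing beyond plain host C++ (no OpenCV or Python; the struct and function names are mine, not the repository's):

// Hypothetical sketch mirroring DecoderThread::makeJPEG's resize/crop math.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>

struct Geom { int new_w, new_h, crop_x, crop_y; };

// Scale so the shortest edge equals `target`, then center-crop to target x target.
Geom resizeCropGeom(int w, int h, int target) {
    double scale = double(target) / std::min(w, h);
    Geom g;
    g.new_w = int(std::round(scale * w));
    g.new_h = int(std::round(scale * h));
    // After scaling, the short side is exactly `target`, so the crop offsets
    // are non-negative and centered.
    g.crop_x = (g.new_w - target) / 2;
    g.crop_y = (g.new_h - target) / 2;
    return g;
}

int main() {
    Geom g = resizeCropGeom(640, 480, 256);  // e.g. a VGA frame -> 256x256 patch
    assert(g.new_h == 256 && g.new_w >= 256);
    std::printf("resized to %dx%d, crop at (%d,%d)\n", g.new_w, g.new_h, g.crop_x, g.crop_y);
    return 0;
}

For a 640x480 input and target 256 this yields a 341x256 resize with the crop starting at x = 42, matching what the CROP_TO_SQUARE path in make-data.py ultimately produces.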
================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/Makefile
================================================
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users. This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################

# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64

# Common binaries
NVCC = $(CUDA_BIN_PATH)/nvcc
GCC = g++
AR = ar

# CUDA code generation flags
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM35)

LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
NVCCFLAGS := -m64

# Debug build flags
ifeq ($(dbg),1)
    CCFLAGS += -g
    NVCCFLAGS += -g -G
    DBG := debug
else
    DBG := release
    NVCCFLAGS += -O3
    CCFLAGS += -O3
endif

# Add profiler output
ifeq ($(prof),1)
    NVCCFLAGS += --ptxas-options=-v
endif

TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)

########## USER STUFF ###########
LDFLAGS += -L../util -lutilpy -lcublas
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include

CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS := $(shell find . -name "*.h")

NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/libnvmatrix.so

################################################################################
# Set up target and object files
################################################################################
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))

# Target rules
all: makedirs $(TARGET)

$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
	$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
	$(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<

$(TARGET): $(OBJS)
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
	ln -sf $(TARGET) .

makedirs:
	mkdir -p $(TARGETDIR)
	mkdir -p $(OBJDIR)/src

clean:
	rm -rf ./obj


================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/include/memory.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MEMORY_CUH_H_
#define MEMORY_CUH_H_

#include <map>
#include <vector>
#include <assert.h>
#include <stdio.h>
#include <math.h>
#include <helper_cuda.h>
#include "../../util/include/sync.h"
#include "nvmatrix_kernels.cuh"

#define GPU_ALLOC_FRACTION 0.95 // Take 95% of available GPU memory
#define HOST_ALLOC_CHUNK (1UL << 32)
#define SYNC_ON_FREE true

#define BUCKET_TYPE unsigned int
// Allocate memory from up to this many buckets higher than desired without subdividing
#define BUCKET_DIVISION_THRESHOLD 1
#define NUM_BUCKETS static_cast<int>(sizeof(BUCKET_TYPE) * 8)
#define CLZ(x) ((x) == 0 ? (NUM_BUCKETS) : __builtin_clz(x))
#define CEIL_LOG2(x) (NUM_BUCKETS - CLZ(x)) // Ceiling of log base 2 of (x + 1)
#define LOG_FIRST_BUCKET_SIZE 12
#define FIRST_BUCKET_SIZE (1 << LOG_FIRST_BUCKET_SIZE) // First bucket is for 4K bytes
#define GET_ALLOC_BUCKET(size) (CEIL_LOG2(((size) - 1) >> LOG_FIRST_BUCKET_SIZE))
#define GET_DEALLOC_BUCKET(size) (CEIL_LOG2((size) >> (1 + LOG_FIRST_BUCKET_SIZE)))
#define GET_BUCKET_SIZE(b) (1UL << (LOG_FIRST_BUCKET_SIZE + b))
#define BUCKET_MASK(b) (1UL << (b))
#define PREV_BUCKETS_MASK(b) (BUCKET_MASK(b) - 1)
#define AVAILABLE_NEXT_MASK(b, buckets) ((buckets) & ~PREV_BUCKETS_MASK(b))

/*
 * Returns the "best-matching" available bucket as defined by policy.
 * The two policies are:
 *
 * TAKE_FROM_BIGGEST = true: If a bucket in the range
 * b...{b + BUCKET_DIVISION_THRESHOLD} is available, return the smallest
 * available bucket in that range. Otherwise return the *biggest* available
 * bucket greater than or equal to b.
 *
 * TAKE_FROM_BIGGEST = false: Return the *smallest* available bucket greater
 * than or equal to b.
 *
 * Returns -1 when no satisfactory bucket is available.
 */
#define TAKE_FROM_BIGGEST true
#if TAKE_FROM_BIGGEST
#define GET_AVAILABLE_BUCKET(b, buckets) \
    (-1 + (((AVAILABLE_NEXT_MASK(b, buckets)) \
           & (PREV_BUCKETS_MASK((b) + 1 + BUCKET_DIVISION_THRESHOLD))) \
           /* Smallest bucket >= b */ ? __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) \
           /* Biggest bucket >= b */  : CEIL_LOG2(AVAILABLE_NEXT_MASK(b, buckets))))
#else
#define GET_AVAILABLE_BUCKET(b, buckets) __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets))
#endif

/*
 * Bit get/set/clear.
 */
#define GET_BIT(x, bit) ((x) & (1 << (bit)))
#define SET_BIT(x, bit) ((x) |= (1 << (bit)))
#define CLEAR_BIT(x, bit) ((x) &= ~(1 << (bit)))

typedef struct __align__(512) {
    char data;
} DataType;

#define SIZE_ROUNDUP(size) (sizeof(DataType) * DIVUP((size), sizeof(DataType)))

class MemorySegment {
    friend class FastMemoryManager;
protected:
    DataType* _data;
    size_t _size;
    int _deviceID;

    // Resizes itself to _size - size and
    // returns pointer to new memory segment
    MemorySegment* subdivide(size_t size) {
        assert(size < _size);
//        assert(size % sizeof(DataType) == 0);
        _size -= size;
        return new MemorySegment(_data + _size / sizeof(DataType), size, _deviceID);
    }

    inline size_t getSize() const {
        return _size;
    }
public:
    MemorySegment(DataType* data, size_t size, int deviceID) : _data(data), _size(size), _deviceID(deviceID) {
        assert(size % sizeof(DataType) == 0);
    }

    // In some cases size is irrelevant
    template <typename T>
    MemorySegment(T* data) : _data(reinterpret_cast<DataType*>(data)), _size(0), _deviceID(-1) {
    }

    template <typename T>
    inline T* getData() const {
        return reinterpret_cast<T*>(_data);
    }

    template <typename T>
    inline T** getDataPtr() {
        return reinterpret_cast<T**>(&_data);
    }

    inline int getDeviceID() const {
        return _deviceID;
    }
};

class MemoryManager {
protected:
    static Lock _globalLock;
public:
    virtual MemoryManager* init() = 0;
    virtual MemorySegment* malloc(size_t size) = 0;
    virtual void free(MemorySegment* mem) = 0;
    virtual ~MemoryManager() {
    }
};

class FastMemoryManager : public MemoryManager {
protected:
    int _deviceID;
    Lock _lock;
    DataType* _data;
    size_t _size;
    BUCKET_TYPE _buckets; // Bucket availability bit vector
    std::vector<std::vector<MemorySegment*> > _freeSegments; // bucket idx -> vector of segments

    static std::map<int, MemoryManager*> _memoryManagers;

    virtual void allocateInitialSegment() {
        assert(_deviceID >= 0);
        assert(FIRST_BUCKET_SIZE % sizeof(DataType) == 0);
        checkCudaErrors(cudaSetDevice(_deviceID));
        size_t memFree, memTotal;
        checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal));
        _size = sizeof(DataType) * (size_t(round(double(memFree) * GPU_ALLOC_FRACTION)) / sizeof(DataType));
        printf("FastMemoryManager[%d] allocating %lu-byte initial segment\n", _deviceID, _size);
        checkCudaErrors(cudaMalloc(&_data, _size));
    }

    virtual void freeInitialSegment() {
        checkCudaErrors(cudaFree(_data));
    }

public:
    static MemoryManager& getInstance(int deviceID);
    static void destroyInstance(int deviceID);

    FastMemoryManager(int deviceID) : _deviceID(deviceID), _data(NULL), _size(0), _buckets(0) {
    }

    ~FastMemoryManager() {
        freeInitialSegment();
        for (int i = 0; i < _freeSegments.size(); ++i) {
            for (int j = 0; j < _freeSegments[i].size(); ++j) {
                delete _freeSegments[i][j];
            }
        }
    }

    virtual MemoryManager* init() {
        allocateInitialSegment();
        for (int i = 0; i < NUM_BUCKETS; ++i) {
            _freeSegments.push_back(std::vector<MemorySegment*>());
        }
        int bucket = GET_DEALLOC_BUCKET(_size);
        SET_BIT(_buckets, bucket);
        _freeSegments[bucket].push_back(new MemorySegment(_data, _size, _deviceID));
        return this;
    }

    MemorySegment* malloc(size_t size) {
        assert(size > 0);
        int requestedBucket = GET_ALLOC_BUCKET(size);

        _lock.acquire();
        int bucket = GET_AVAILABLE_BUCKET(requestedBucket, _buckets);
//        if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) {
//            printf("MemoryManager[%d] requested size: %lu, requested bucket: %d, available bucket: %d\n", _deviceID, size, requestedBucket, bucket);
//        }
        assert(bucket >= requestedBucket); // Out of memory

        MemorySegment* sourceSegment = _freeSegments[bucket].back();
        MemorySegment* ret = sourceSegment;
        if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { // We got a much bigger chunk than we wanted
            ret = sourceSegment->subdivide(GET_BUCKET_SIZE(requestedBucket));
            int newSrcBucket = GET_DEALLOC_BUCKET(sourceSegment->getSize());
            if (newSrcBucket != bucket) {
                _freeSegments[bucket].pop_back();
                _freeSegments[newSrcBucket].push_back(sourceSegment);
                SET_BIT(_buckets, newSrcBucket);
            }
        } else {
            _freeSegments[bucket].pop_back();
        }
        if (_freeSegments[bucket].size() == 0) {
            CLEAR_BIT(_buckets, bucket);
        }
        _lock.release();
        return ret;
    }

    void free(MemorySegment* mem) {
        assert(mem != NULL);
        assert(mem->getSize() >= FIRST_BUCKET_SIZE);
        int bucket = GET_DEALLOC_BUCKET(mem->getSize());

        // Synchronize for safety, so that we don't free memory that's being used. Not synchronizing
        // could potentially cause a problem if we re-allocate the just-freed chunk and attempt to
        // use it in a different stream.
        if (SYNC_ON_FREE) {
            int d;
            checkCudaErrors(cudaGetDevice(&d));
            checkCudaErrors(cudaSetDevice(mem->getDeviceID()));
            checkCudaErrors(cudaDeviceSynchronize());
            checkCudaErrors(cudaSetDevice(d));
        }
        _lock.acquire();
        _freeSegments[bucket].push_back(mem);
        SET_BIT(_buckets, bucket);
//        printf("MemoryManager[%d] Freed segment of size %lu into bucket %lu\n", _deviceID, mem->getSize(), bucket);
        _lock.release();
    }
};

class FastHostMemoryManager : public FastMemoryManager {
protected:
    static MemoryManager* _memoryManager;

    void allocateInitialSegment() {
        _size = HOST_ALLOC_CHUNK;
        checkCudaErrors(cudaHostAlloc(&_data, _size, cudaHostAllocPortable));
    }
    void freeInitialSegment () {
        checkCudaErrors(cudaFreeHost(_data));
    }
public:
    FastHostMemoryManager() : FastMemoryManager(DEVICE_HOST) {
    }
    static MemoryManager& getInstance();
    static void destroyInstance();
};

class CUDAMemoryManager : public MemoryManager {
protected:
    static MemoryManager* _memoryManager;

    virtual void _malloc(DataType** data, size_t size) {
        checkCudaErrors(cudaMalloc(data, size));
    }
    virtual void _free(MemorySegment* mem) {
        checkCudaErrors(cudaFree(mem->getData<DataType>()));
    }
public:
    static MemoryManager& getInstance(int deviceID);
    static void destroyInstance(int deviceID);

    CUDAMemoryManager() {
    }

    MemoryManager* init() {
        return this;
    }

    MemorySegment* malloc(size_t size) {
        MemorySegment* seg = new MemorySegment(reinterpret_cast<DataType*>(NULL));
        DataType** data = seg->getDataPtr<DataType>();
        _malloc(data, size);
        return seg;
    }

    void free(MemorySegment* mem) {
        assert(mem != NULL);
        _free(mem);
        delete mem;
    }
};

class CUDAHostMemoryManager : public CUDAMemoryManager {
protected:
    static MemoryManager* _memoryManager;

    void _free(MemorySegment* mem) {
        checkCudaErrors(cudaFreeHost(mem->getData<DataType>()));
    }
    void _malloc(DataType** data, size_t size) {
        checkCudaErrors(cudaHostAlloc(data, size, cudaHostAllocPortable));
    }
public:
    static MemoryManager& getInstance();
    static void destroyInstance();
    CUDAHostMemoryManager() : CUDAMemoryManager() {
    }
};

#endif /* MEMORY_CUH_H_ */
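The bucket arithmetic above is compact enough to misread, so here is a small standalone check of what the macros compute. This is a hypothetical host-only sketch (the helper names are mine; it assumes a GCC/Clang `__builtin_clz`, as memory.cuh itself does):

// Standalone sketch verifying memory.cuh's bucket math.
// Bucket b holds segments of capacity 2^(12+b) bytes (bucket 0 = 4 KiB).
#include <cassert>
#include <cstddef>

static const int kLogFirst = 12;
static const int kNumBuckets = 32;

static int ceilLog2(unsigned x) { return x == 0 ? 0 : kNumBuckets - __builtin_clz(x); }
static int allocBucket(size_t size)   { return ceilLog2((size - 1) >> kLogFirst); }         // GET_ALLOC_BUCKET
static int deallocBucket(size_t size) { return ceilLog2(size >> (kLogFirst + 1)); }         // GET_DEALLOC_BUCKET

int main() {
    // An allocation request rounds *up* to the smallest bucket that fits it...
    assert(allocBucket(4096) == 0);    // exactly 4 KiB -> bucket 0
    assert(allocBucket(4097) == 1);    // one byte more -> the 8 KiB bucket
    assert(allocBucket(8192) == 1);
    // ...while a freed segment is filed *down* into the largest bucket whose
    // requests it can fully serve.
    assert(deallocBucket(4096) == 0);
    assert(deallocBucket(8192) == 1);
    assert(deallocBucket(12288) == 1); // a 12 KiB leftover still serves 8 KiB requests
    return 0;
}

The asymmetry between the two roundings is what keeps malloc's invariant safe: any segment sitting in bucket b is guaranteed to be at least GET_BUCKET_SIZE(b) bytes.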
================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NVMATRIX_H_
#define NVMATRIX_H_

#include <map>
#include <vector>
#include <pthread.h>
#include <stdio.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <curand.h>
#include <curand_kernel.h>
#include <helper_cuda.h>
#include "../../util/include/matrix.h"
#include "nvmatrix_kernels.cuh"
#include "nvmatrix_operators.cuh"
#include "memory.cuh"

#ifdef WARNINGS
#define WARN(msg) printf("WARN: File %s, line %d: %s\n", __FILE__, __LINE__, msg);
#else
#define WARN(msg) ;
#endif

#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
                            printf("CURAND Error at %s:%d\n",__FILE__,__LINE__);\
                            exit(EXIT_FAILURE);}} while(0)

#define CUBLAS_CALL(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
                            printf("CUBLAS Error at %s:%d\n",__FILE__,__LINE__);\
                            exit(EXIT_FAILURE);}} while(0)

/*
 * Memory manager to use for GPU memory allocations.
 *
 * CUDAMemoryManager: Default Nvidia memory manager; just calls cudaMalloc / cudaFree.
 *                    Allocating and freeing memory is slow.
 * FastMemoryManager: A GPU memory manager with very fast (constant time)
 *                    alloc / free, but possibly more wasteful of memory.
 */
#define DEVICE_MEMORY_MANAGER CUDAMemoryManager

/*
 * Memory manager to use for host memory allocations.
 *
 * CUDAHostMemoryManager: Default Nvidia memory manager; just calls cudaHostAlloc / cudaFreeHost.
 *                        Allocating and freeing memory is slow.
 * FastHostMemoryManager: A host memory manager with very fast (constant time)
 *                        alloc / free, but possibly more wasteful of memory.
 */
#define HOST_MEMORY_MANAGER CUDAHostMemoryManager

class NVMatrix;
typedef std::vector<NVMatrix*> NVMatrixV;

class NVMatrix {
protected:
    int _numCols, _numRows;
    int _numElements;
    int _stride;
//    float* getDevData();
    MemorySegment* _memSegment;
    bool _isTrans;
    bool _ownsData;
    // This flag makes sure that the NVMatrix destructor does nothing
    // when called on HostNVMatrix instance.
    bool _deleted;
    cudaTextureObject_t _texObj;

//    static std::map<int,curandGenerator_t> rndGen;
    static std::map<int,curandState*> _rndDevStates;
    static std::map<int,cublasHandle_t> _cublasHandles;
    // Map from device id --> # of random streams initialized on that device
    static std::map<int,int> _rndDevThreads;
    static pthread_mutex_t *_rndMutex, *_cublasMutex, *_streamMutex;
    // Map from device id --> default stream
    static std::map<int,cudaStream_t> _defaultStreams;

    cublasOperation_t getTransChar() const {
        /*
         * not a typo! return opposite character because a
         * non-transposed nvmatrix is in row-major order while a non-transposed
         * cublas matrix is in column-major order.
         */
        return _isTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
    }

    void _init(bool isTrans);
    void _sum_setParams(int n, dim3* blocks, dim3* threads);
    template<class Agg> float cpuAgg(Agg agg, cudaStream_t stream);
    template<class Agg> float _totalAgg(Agg agg);
    template<class Agg> float _totalAgg(Agg agg, cudaStream_t stream);
    template<class Agg> float _totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream);
    template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp);
    template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream);
    template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop);
    template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream);
    template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop);
    template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream);
    template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop);
    template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream);
    template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop);
    template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp);
    template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
    template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp);
    template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
    template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp);
    template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
    template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, NVMatrix& tmp);
    template<class Randomizer> void _unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream);
    template<class Randomizer> void _unaryRandomize(NVMatrix& target, Randomizer rnd);
    template<class Randomizer> void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd);
    template<class Randomizer> void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream);

    virtual void alloc(int numElements);
    virtual void dealloc();
    void deallocTexture();
    virtual NVMatrix& construct() const;
    virtual NVMatrix& construct(bool isTrans) const;
    virtual NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const;
    virtual NVMatrix& construct(const Matrix& like, bool copy) const;
    virtual NVMatrix& construct(const NVMatrix& like, bool copy) const;
    virtual NVMatrix& construct(const NVMatrix& like) const;
    virtual NVMatrix& construct(const Matrix& like) const;
    virtual NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const;

    static cublasHandle_t getCublasHandle();
    static cublasHandle_t getCublasHandle(int deviceID);
public:
    NVMatrix();
    NVMatrix(bool isTrans);
    NVMatrix(int numRows, int numCols, bool isTrans=false);
    NVMatrix(const Matrix& like, bool copy);
    NVMatrix(const NVMatrix& like, bool copy);
    NVMatrix(const NVMatrix& like);
    NVMatrix(const Matrix& like);
    NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans);
    virtual ~NVMatrix();

    // Returns the device ID on which the data pointer is allocated
    int getDataDeviceID() const;
    static void initRandom(unsigned long long seed, int numStreams, cudaStream_t stream);
    static void initRandom(unsigned long long seed, int numStreams);
    static void initRandom(unsigned long long seed);
    static void initRandom();
    static void initCublas();
    static void destroyCublas();
    static std::pair<size_t, size_t> getCudaMemorySize();

    // Returns the currently-active device ID for calling thread
    static int getDeviceID();
    static void setDeviceID(int d);
    static bool canAccessPeer(int srcDevice, int tgtDevice);
    static bool isRndInitialized();
    static bool isRndInitialized(bool haveLock);
    static curandState* getCurandState();
    static curandState* getCurandState(int numStreams);
    static void destroyRandom();
    static pthread_mutex_t* makeMutex();
    static cudaStream_t getDefaultStream(int deviceID);
    static cudaStream_t getDefaultStream();
    static void syncDevice();
    static void syncStream();
    static void syncStream(cudaStream_t stream);

    /*
     * DO NOT DEREFERENCE IN HOST CODE! This is a device memory pointer.
     */
    float* getCellPtr(int i, int j) const {
        if (_isTrans) {
            return &getDevData()[j * _numRows + i];
        }
        return &getDevData()[i * _numCols + j];
    }

    bool isSameDims(const Matrix& m) const {
        return m.getNumRows() == _numRows && m.getNumCols() == _numCols;
    }

    bool isSameDims(const NVMatrix& m) const {
        return m.getNumRows() == _numRows && m.getNumCols() == _numCols;
    }

    int getNumRows() const {
        return _numRows;
    }

    int getNumCols() const {
        return _numCols;
    }

    int getStride() const {
        return _stride;
    }

    int getLeadingDim() const {
        return _isTrans ? _numRows : _numCols;
    }

    int getFollowingDim() const {
        return !_isTrans ? _numRows : _numCols;
    }

    /*
     * FALSE: Row-major order.
     * TRUE:  Column-major order.
     */
    bool isTrans() const {
        return _isTrans;
    }

    bool isView() const {
        return !_ownsData;
    }

    float* getDevData() const {
        return _memSegment == NULL ? NULL : _memSegment->getData<float>();
    }

    MemorySegment& getMemorySegment() const {
        return *_memSegment;
    }

    int getNumElements() const {
        return _numElements;
    }

    size_t getNumDataBytes() const {
        return size_t(_numElements) * 4;
    }
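    /*
     * Worked example (annotation added for clarity; not in the original
     * header). The accessors above are all there is to a "soft" transpose:
     * for a row-major 2x3 matrix m over contiguous data {0,1,2,3,4,5}
     * (stride 3),
     *
     *     m.getCellPtr(1, 2)  ==  data + 1*3 + 2  ==  data + 5
     *
     * Soft-transposing flips _isTrans and swaps the dimensions, so the 3x2
     * column-major view t of the *same* buffer resolves
     *
     *     t.getCellPtr(2, 1)  ==  data + 1*3 + 2  ==  data + 5
     *
     * i.e. element (i,j) of the view is element (j,i) of the original, with
     * no data movement; getLeadingDim()/getFollowingDim() swap accordingly,
     * and getTransChar() feeds the opposite cublas op for the same reason.
     */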
    /*
     * Only use if you know what you're doing!
     * Does not actually transpose matrix.
     */
    void setTrans(bool trans) {
        if (trans != _isTrans) {
            assert(isContiguous());
            _isTrans = trans;
            _stride = getLeadingDim();
        }
    }

    /*
     * Only use if you know what you're doing!
     * This toggles whether this object will free its GPU memory when it's destroyed.
     */
    void setIsView(bool isView) {
        _ownsData = !isView;
    }

    bool isContiguous() const {
        return _stride == getLeadingDim() || getFollowingDim() == 1;
    }

    void truncate() {
        resize(0,0);
    }

    virtual cudaTextureObject_t getTextureObject();

    virtual void copyFromHost(const Matrix& hostMatrix);
    virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget);
    virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream);
    virtual void copyToHost(Matrix& hostMatrix) const;
    virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget) const;
    virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const;
    void copy(NVMatrix& dest) const;
    void copy(NVMatrix& dest, cudaStream_t stream) const;
    NVMatrix& copy() const;
    void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream);
    void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB);
    void addProduct(NVMatrix& a, NVMatrix &b);
    void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream);
    void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target);
    void rightMult(NVMatrix &b, NVMatrix &target);
    void rightMult(NVMatrix &b, float scaleAB);
    void randomizeUniform();
    void addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target);
    void addGaussianNoise(float stdev, NVMatrix& target);
    void addGaussianNoise(NVMatrix& stdevs, bool var);
    void addGaussianNoise(NVMatrix& stdevs);
    void addGaussianNoise(float stdev);
    void addGaussianNoise();
    void randomizeGaussian();
    void randomizeGaussian(float stdev);
    void randomizeGaussian(float mean, float stdev);
    void randomizeGaussian(float mean, NVMatrix& stdevs);
    void randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs);
    void randomizeGaussian(NVMatrix& stdevs);
    void randomizeGaussian(NVMatrix& stdevs, NVMatrix& target);
    void binarizeProbs();
    void binarizeProbs(NVMatrix& target);

    void biggerThan(NVMatrix& m, NVMatrix& target);
    void biggerThan(NVMatrix& m);
    void biggerThanVector(NVMatrix& vec, NVMatrix& target);
    void biggerThanVector(NVMatrix& vec);
    void equals(NVMatrix& m, NVMatrix& target);
    void equals(NVMatrix& m);

    void _checkBounds(int startRow, int endRow, int startCol, int endCol) const;
    NVMatrix& slice(int startRow, int endRow, int startCol, int endCol) const;
    void slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const;
    NVMatrix& sliceRows(int startRow, int endRow) const;
    void sliceRows(int startRow, int endRow, NVMatrix& target) const;
    NVMatrix& sliceCols(int startCol, int endCol) const;
    void sliceCols(int startCol, int endCol, NVMatrix& target) const;

    NVMatrixV& splitRows(int numParts);
    NVMatrixV& splitCols(int numParts);

    template<class Op> void apply(Op op, NVMatrix& target, cudaStream_t stream) {
        if (!target.isSameDims(*this)) {
            target.resize(*this);
        }
        if (getNumElements() > 0) {
            int height = target.getFollowingDim(), width = target.getLeadingDim();
            if (target.isTrans() == isTrans()) {
                if (!isContiguous() || !target.isContiguous()) {
                    dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)),
                                std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y)));
                    dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
                    kEltwiseUnaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
                    getLastCudaError("kEltwiseUnaryOp: Kernel execution failed");
                } else {
                    dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
                    dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
                    kEltwiseUnaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), _numElements, op);
                    getLastCudaError("kEltwiseUnaryOpFlat: Kernel execution failed");
                }
            } else {
                dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)),
                            std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y)));
                dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
                bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0);
//                printf("height: %d, width: %d, stride: %d, target stride: %d, check bounds: %d, threads.x: %d, threads.y: %d, blocks.x: %d, blocks.y: %d\n",
//                       height, width, getStride(), target.getStride(), checkBounds, threads.x, threads.y, blocks.x, blocks.y);
                if (checkBounds) {
                    kEltwiseUnaryOpTrans<Op, true><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
                } else {
                    kEltwiseUnaryOpTrans<Op, false><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
                }
                getLastCudaError("kEltwiseUnaryOpTrans: Kernel execution failed");
            }
        }
    }

    template<class Op> void apply(Op op, cudaStream_t stream) {
        apply(op, *this, stream);
    }

    template<class Op> void apply(Op op, NVMatrix& target) {
        apply(op, target, getDefaultStream());
    }

    template<class Op> void apply(Op op) {
        apply(op, *this);
    }

    template<class Op> void applyBinary(Op op, NVMatrix& b) {
        applyBinary(op, b, *this);
    }

    template<class Op> void applyBinary(Op op, NVMatrix& b, NVMatrix& target) {
        applyBinary(op, b, target, getDefaultStream());
    }

    template<class Op> void applyBinary(Op op, NVMatrix& b, NVMatrix& target, cudaStream_t stream) {
        assert(this->isSameDims(b));
        if (!target.isSameDims(*this)) {
            target.resize(*this);
        }
        if (getNumElements() > 0) {
            int height = target.getFollowingDim(), width = target.getLeadingDim();
            if (target.isTrans() == isTrans() && target.isTrans() == b.isTrans()) {
                if (!isContiguous() || !b.isContiguous() || !target.isContiguous()) {
                    dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)),
                                std::min(128, DIVUP(height, ELTWISE_THREADS_Y)));
                    dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
                    kEltwiseBinaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), target.getStride(), op);
                } else {
                    dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
                    dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
                    kEltwiseBinaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), _numElements, op);
                }
                getLastCudaError("kEltwiseBinaryOp: Kernel execution failed");
            } else {
                dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)),
                            std::min(128, DIVUP(height, ELTWISE_THREADS_Y)));
                dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
                // both x here since y divides x
                bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0);
                if (target.isTrans() == isTrans() && target.isTrans() != b.isTrans()) {
                    if (checkBounds) {
                        kEltwiseBinaryOpTrans<Op,true,false,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), target.getStride(), op);
                    } else {
                        kEltwiseBinaryOpTrans<Op,false,false,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), target.getStride(), op);
                    }
                } else if (target.isTrans() != isTrans() && target.isTrans() != b.isTrans()) {
                    if (checkBounds) {
                        kEltwiseBinaryOpTrans<Op,true,true,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), target.getStride(), op);
                    } else {
                        kEltwiseBinaryOpTrans<Op,false,true,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), target.getStride(), op);
                    }
                } else if (target.isTrans() != isTrans() && target.isTrans() == b.isTrans()) {
                    if (checkBounds) {
                        kEltwiseBinaryOpTrans<Op,true,false,true><<<blocks, threads, 0, stream>>>(b.getDevData(), getDevData(), target.getDevData(), height, width, b.getStride(), getStride(), target.getStride(), op);
                    } else {
                        kEltwiseBinaryOpTrans<Op,false,false,true><<<blocks, threads, 0, stream>>>(b.getDevData(), getDevData(), target.getDevData(), height, width, b.getStride(), getStride(), target.getStride(), op);
                    }
                }
                getLastCudaError("kEltwiseBinaryOpTrans: Kernel execution failed");
            }
        }
    }

    template<class Op> void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target) {
        applyTernary(op, b, c, target, getDefaultStream());
    }

    template<class Op> void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target, cudaStream_t stream) {
        assert(isSameDims(b));
        assert(isSameDims(c));
        // For now ternary ops are only supported for matrices of same transposedness
        assert(isTrans() == b.isTrans());
        assert(isTrans() == c.isTrans());
        if (!target.isSameDims(*this) || target.isTrans() != isTrans()) {
            target.resize(*this);
        }
        if (getNumElements() > 0) {
            int height = target.getFollowingDim(), width = target.getLeadingDim();
            if (!isContiguous() || !b.isContiguous() || !c.isContiguous() || !target.isContiguous()) {
                dim3 blocks(std::min(512, DIVUP(width, ELTWISE_THREADS_X)),
                            std::min(512, DIVUP(height, ELTWISE_THREADS_Y)));
                dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
                kEltwiseTernaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), height, width, getStride(), b.getStride(), c.getStride(), target.getStride(), op);
                getLastCudaError("kEltwiseTernaryOp: Kernel execution failed");
            } else {
                dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
                dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
                kEltwiseTernaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), _numElements, op);
                getLastCudaError("kEltwiseTernaryOpFlat: Kernel execution failed");
            }
        }
    }
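    /*
     * Worked example (annotation added for clarity; not in the original
     * header). apply()/applyBinary() pick a kernel from the operand layouts.
     * For NVMatrix a(128, 64) and target t(128, 64), both contiguous and
     * row-major, a unary op (e.g. a Square functor, assuming one is defined
     * in nvmatrix_operators.cuh) takes the flat path: kEltwiseUnaryOpFlat
     * over 128*64 elements. If t were instead column-major, the transpose
     * path runs, with bounds checking resolved at compile time: both
     * dimensions are multiples of ELTWISE_THREADS_X (32), so the
     * non-bounds-checked <Op, false> instantiation is launched.
     */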
    bool resize(int numRows, int numCols, bool trans);
    bool resize(int numRows, int numCols);
    bool resize(const NVMatrix &like);
    bool resize(const Matrix &like);
    void reshape(int numRows, int numCols);
    NVMatrix& reshaped(int numRows, int numCols) const;
    void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol) const;
    void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol, cudaStream_t stream) const;
    void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream);
    void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target);
    void add(NVMatrix& b, float scaleB, NVMatrix& target);
    void add(NVMatrix& b, NVMatrix& target);
    void add(NVMatrix& b, float scaleB);
    void add(NVMatrix& b, float scaleA, float scaleB);
    void add(NVMatrix& b);
    void eltwiseMult(NVMatrix& b);
    void eltwiseMult(NVMatrix& b, NVMatrix& target);
    void eltwiseDivide(NVMatrix& b);
    void eltwiseDivide(NVMatrix& b, NVMatrix& target);
    void squaredDiff(NVMatrix& b);
    void squaredDiff(NVMatrix& b, NVMatrix& target);
    void subtract(NVMatrix& b, NVMatrix& target);
    void subtract(NVMatrix& b);
    void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream);
    void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target);
    void addVector(NVMatrix& vec);
    void addVector(NVMatrix& vec, float scaleVec);
    void addVector(NVMatrix& vec, NVMatrix& target);
    void equalsVector(NVMatrix& vec, NVMatrix& target);
    void equalsVector(NVMatrix& vec);
    void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream);
    void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target);
    void eltwiseMultByVector(NVMatrix& vec);
    void eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream);
    void eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target);
    void eltwiseDivideByVector(NVMatrix& vec);
    void tile(int timesY, int timesX, NVMatrix& target);
    void tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream);

    void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum);
    void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream);
    void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax);
    void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream);
    void sum(int axis, NVMatrix& target, cudaStream_t stream);
    void sum(int axis, NVMatrix& target);
    void sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp);
    void sum(int axis, NVMatrix& target, NVMatrix& tmp);
    NVMatrix& sum(int axis);
    void max(int axis, NVMatrix& target);
    void max(int axis, NVMatrix& target, NVMatrix& tmp);
    NVMatrix& max(int axis);
    void min(int axis, NVMatrix& target);
    NVMatrix& min(int axis);
    void sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream);
    void sumOfSquares(int axis, NVMatrix& target);
    NVMatrix& sumOfSquares(int axis);
    float mean();
    float sum();
    float sum(NVMatrix& tmpbuf);
    float max();
    float min();
    float countInf();
    float countNan();
    float norm2();
    float norm();

    void inRangeInc(float lower, float upper);
    void inRangeInc(float lower, float upper, NVMatrix& target);
    void inRangeExc(float lower, float upper);
    void inRangeExc(float lower, float upper, NVMatrix& target);
    void biggerThanScalar(float scalar);
    void biggerThanScalar(float scalar, NVMatrix& target);
    void smallerThanScalar(float scalar);
    void smallerThanScalar(float scalar, NVMatrix& target);
    void addScalar(float scaleThis, float scalar, NVMatrix& target);
    void addScalar(float scalar, NVMatrix& target);
    void addScalar(float scalar);
    void minWithScalar(float scalar, NVMatrix& target);
    void minWithScalar(float scalar);
    void maxWithScalar(float scalar, NVMatrix& target);
    void maxWithScalar(float scalar);
    void pow(float p, NVMatrix& target);
    void pow(float p);
    void scale(float _scale);
    void scale(float _scale, NVMatrix& target);
    void scale(float _scale, NVMatrix& target, cudaStream_t stream);
    void scale(float _scale, cudaStream_t stream);
    void zero();
    void zero(NVMatrix& like);

    float dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream);
    float dotProduct(NVMatrix& b, cudaStream_t stream);
    float dotProduct(NVMatrix& b);

    /*
     * Does SOFT transpose and returns result, leaving this matrix unchanged
     */
    NVMatrix& getTranspose();
    NVMatrix& getClone();

    /*
     * Does HARD transpose and puts result in target
     */
    void transpose(NVMatrix& target);

    /*
     * Does SOFT transpose
     */
    void transpose();
    bool transpose(bool trans);

    void flipTrans(NVMatrix& target, cudaStream_t stream);
    void flipTrans(NVMatrix& target);
    NVMatrix& flipTrans();

    void print(int startRow, int rows, int startCol, int cols) const;
    void print(int rows, int cols) const;
    void printShape(const char* name) const;

    template<class Op> void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target) {
        applyBinaryV(op, vec, target, getDefaultStream());
    }

    template<class Op> void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target, cudaStream_t stream) {
        assert(&target != &vec); // for now
        if (isSameDims(vec)) {
            applyBinary(op, vec, target, stream);
            return;
        }
        assert(vec.getNumRows() == 1 || vec.getNumCols() == 1);
        assert(vec.getNumRows() == _numRows || vec.getNumCols() == _numCols);
        assert(vec.isContiguous());

        target.resize(*this); // target must be same orientation as me for now

        int width = getLeadingDim(); //_isTrans ? _numRows : _numCols;
        int height = getFollowingDim(); //_isTrans ? _numCols : _numRows;
        dim3 threads(ADD_VEC_THREADS_X, ADD_VEC_THREADS_Y);

        if ((vec.getNumRows() == _numRows && !isTrans()) || (vec.getNumCols() == _numCols && isTrans())) {
            dim3 blocks(std::min(512, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y)));
            kColVectorOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op);
        } else {
            dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y)));
            kRowVectorOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op);
        }
        getLastCudaError("Kernel execution failed");
//        cudaThreadSynchronize();
    }

    template<class UnaryOperator> float argMax(UnaryOperator u) {
        return _totalAgg(NVMatrixAggs::ArgMax<UnaryOperator>(u));
    }

    static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev);
    static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream);
    static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev);
    static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB);

    static void assertSame(NVMatrixV& a);
};

class HostNVMatrix : public NVMatrix {
protected:
    void alloc(int numElements);
    void dealloc();
    NVMatrix& construct() const;
    NVMatrix& construct(bool isTrans) const;
    NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const;
    NVMatrix& construct(const Matrix& like, bool copy) const;
    NVMatrix& construct(const NVMatrix& like, bool copy) const;
    NVMatrix& construct(const NVMatrix& like) const;
    NVMatrix& construct(const Matrix& like) const;
    NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const;
public:
    ~HostNVMatrix();
    HostNVMatrix();
    HostNVMatrix(bool isTrans);
    HostNVMatrix(int numRows, int numCols, bool isTrans=false);
    HostNVMatrix(const Matrix& like, bool copy);
    HostNVMatrix(const NVMatrix& like, bool copy);
    HostNVMatrix(const NVMatrix& like);
    HostNVMatrix(const Matrix& like);
    HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans);
    void copyFromHost(const Matrix& hostMatrix);
    void copyFromHost(const Matrix& hostMatrix, bool resizeTarget);
    void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream);
    void copyToHost(Matrix& hostMatrix) const;
    void copyToHost(Matrix& hostMatrix, bool resizeTarget) const;
    void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const;
    cudaTextureObject_t getTextureObject();
};

#endif /* NVMATRIX_H_ */
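Before the kernel header itself, here is a compilable toy showing the pattern that apply() and these kernels rely on: a user-defined functor passed by value into a templated __global__ function that walks the buffer with a grid-stride loop. This is a hypothetical standalone sketch (my own names; it does not use NVMatrix or the repo's operator structs):

// Minimal CUDA sketch of the flat, contiguous path apply() takes.
#include <cstdio>
#include <cuda_runtime.h>

// A functor in the style of nvmatrix_operators.cuh: state lives in members,
// the call operator runs on the device.
struct AddScalar {
    float s;
    __device__ float operator()(float x) const { return x + s; }
};

// Grid-stride loop over a contiguous buffer, as in kEltwiseUnaryOpFlat.
template<class Op>
__global__ void unaryFlat(const float* a, float* dest, int n, Op op) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        dest[i] = op(a[i]);
    }
}

int main() {
    const int n = 1 << 16;
    float *a, *d;
    cudaMalloc(&a, n * sizeof(float));
    cudaMalloc(&d, n * sizeof(float));
    cudaMemset(a, 0, n * sizeof(float));
    AddScalar op = { 3.0f };
    // A fixed, capped grid; the stride loop covers whatever the grid doesn't.
    unaryFlat<AddScalar><<<128, 128>>>(a, d, n, op);
    printf("launch status: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
    cudaFree(a);
    cudaFree(d);
    return 0;
}

Capping the grid (here at 128 blocks, as apply() caps at 128 or NUM_BLOCKS_MAX) and letting the stride loop absorb the remainder is what makes these kernels correct for any matrix size without recomputing launch bounds.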
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef NVMATRIX_KERNEL_H_ #define NVMATRIX_KERNEL_H_ #include #if defined(_WIN64) || defined(_WIN32) #define uint unsigned int #endif #define NUM_BLOCKS_MAX 65535 #define TEXTURE_SIZE_MAX (1<<29) #define NUM_RND_BLOCKS 96 #define NUM_RND_THREADS_PER_BLOCK 128 #define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK) /* * Default grid/block sizes for the various functions. */ #define ADD_BLOCK_SIZE 16 #define NUM_TILE_BLOCKS 4096 #define NUM_TILE_THREADS_PER_BLOCK 512 #define ELTWISE_THREADS_X 32 #define ELTWISE_THREADS_Y 8 #define ELTWISE_FLAT_THREADS_X 128 #define NUM_SUM_COLS_THREADS_PER_BLOCK 128 #define AGG_SHORT_ROWS_THREADS_X 32 #define AGG_SHORT_ROWS_THREADS_Y 8 #define AGG_SHORT_ROWS_LOOPS_Y 32 #define DP_BLOCKSIZE 512 #define CPUSUM_MAX 4096 #define ADD_VEC_THREADS_X 64 #define ADD_VEC_THREADS_Y 4 #ifndef DIVUP #define DIVUP(x, y) (((x) + (y) - 1) / (y)) #endif #define MYMAX(a, b) ((a) > (b) ? (a) : (b)) #ifndef MUL24 // legacy #define MUL24(x,y) ((x) * (y)) #endif #define AWR_NUM_THREADS 256 #define WARP_SIZE 32 #define AWR_NUM_WARPS AWR_NUM_THREADS / WARP_SIZE #define AWR_LOG_NUM_THREADS 8 #define LOG_WARP_SIZE 5 #define AWR_LOG_NUM_WARPS 3 #define DEVICE_HOST -1 #define DEVICE_NULL -2 __global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight); __global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements); __global__ void kSetupCurand(curandState *state, unsigned long long seed); template __device__ T shfl_down(T a, int b, int c=WARP_SIZE) { #if __CUDA_ARCH__ >= 300 return __shfl_down(a, b, c); #else return 0; #endif } /* * For now this is supported only for arrays with the same transposedness. */ template __global__ void kEltwiseTernaryOp(const float* a, const float* b, const float* c, float* const dest, const uint height, const uint width, uint strideA, const uint strideB, const uint strideC, const uint strideDest, Op op) { const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x; const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y; for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) { for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) { dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x], c[y * strideC + x]); } } } template __global__ void kEltwiseTernaryOpFlat(const float* a, const float* b, const float* c, float* const dest, const uint numElements, Op op) { const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x; for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) { dest[x] = op(a[x], b[x], c[x]); } } /* * dest here is assumed to be "not transposed" -- height and width correspond to it. * b is assumed to be transposed. * a can be either transposed or not -- depending on parameter. 
 *
 * Performs dest := op(a, b)
 */
template <class Op, bool checkBounds, bool aTrans, bool reverse>
__global__ void kEltwiseBinaryOpTrans(const float* a, const float* b, float* const dest,
                                      const uint height, const uint width, const uint strideA, const uint strideB, const uint strideDest, Op op) {

    __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1]; // x here because that's how much work we do

    for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) {
        for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) {
            const uint readX = by + threadIdx.x;
            const uint readY = bx + threadIdx.y;

            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if (!checkBounds || (readX < height && readY + y < width)) {
                    if (aTrans) {
                        shmem[threadIdx.x][threadIdx.y + y] = reverse ? op(b[(readY+y) * strideB + readX], a[(readY+y) * strideA + readX])
                                                                      : op(a[(readY+y) * strideA + readX], b[(readY+y) * strideB + readX]);
                    } else {
                        shmem[threadIdx.x][threadIdx.y + y] = b[(readY+y) * strideB + readX];
                    }
                }
            }
            __syncthreads();

            const uint writeX = bx + threadIdx.x;
            const uint writeY = by + threadIdx.y;

            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if(!checkBounds || (writeX < width && writeY + y < height)) {
                    if (aTrans) {
                        dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x];
                    } else {
                        dest[(writeY + y) * strideDest + writeX] = reverse ? op(shmem[threadIdx.y + y][threadIdx.x], a[(writeY + y) * strideA + writeX])
                                                                           : op(a[(writeY + y) * strideA + writeX], shmem[threadIdx.y + y][threadIdx.x]);
                    }
                }
            }
            __syncthreads();
        }
    }
}

template <class Op>
__global__ void kEltwiseBinaryOp(const float* a, const float* b, float* const dest, const uint height, const uint width,
                                 const uint strideA, const uint strideB, const uint strideDest, Op op) {
    const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x;
    const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y;
    for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) {
        for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) {
            dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x]);
        }
    }
}

template <class Op>
__global__ void kEltwiseBinaryOpFlat(const float* a, const float* b, float* const dest, const uint numElements, Op op) {
    const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x;
    for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) {
        dest[x] = op(a[x], b[x]);
    }
}

/*
 * dest here is assumed to be "not transposed" -- height and width correspond to it.
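 *
 * (Editor's note, a usage sketch assuming the NVMatrix host API declared in
 * nvmatrix.cuh -- host code normally reaches these kernels through
 * NVMatrix::apply rather than launching them directly, e.g.
 *
 *   NVMatrix m(256, 512);
 *   m.randomizeUniform();
 *   m.apply(NVMatrixOps::Exp());   // elementwise e^x on the GPU
 *
 * and the dispatcher presumably picks the flat, strided, or transposing
 * variant depending on whether source and target share a layout.)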
 */
template <class Op, bool checkBounds>
__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest,
                                     const uint height, const uint width, const uint strideA, const uint strideDest, Op op) {

    __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1];

    for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) {
        for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) {
            const uint readX = by + threadIdx.x;
            const uint readY = bx + threadIdx.y;
            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if (!checkBounds || (readX < height && readY + y < width)) {
                    shmem[threadIdx.x][threadIdx.y + y] = op(a[(readY + y) * strideA + readX]);
                }
            }
            __syncthreads();

            const uint writeX = bx + threadIdx.x;
            const uint writeY = by + threadIdx.y;
            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if(!checkBounds || (writeX < width && writeY + y < height)) {
                    dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x];
                }
            }
            __syncthreads();
        }
    }
}

template <class Op>
__global__ void kEltwiseUnaryOpFlat(const float* a, float* const dest, const uint numElements, Op op) {
    const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x;
    for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) {
        dest[x] = op(a[x]);
    }
}

template <class Op>
__global__ void kEltwiseUnaryOp(const float* a, float* const dest, const uint height, const uint width,
                                const uint strideA, const uint strideDest, Op op) {
    const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x;
    const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y;
    for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) {
        for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) {
            dest[y * strideDest + x] = op(a[y * strideA + x]);
        }
    }
}

/*
 * Matrix in ROW-MAJOR order!
 */
template <class Op>
__global__ void kRowVectorOp(const float* mat, const float* vec, float* const tgtMat, const uint width, const uint height,
                             const uint matStride, const uint tgtStride, Op op) {
    __shared__ float shVec[ADD_VEC_THREADS_X];
    const uint bx = ADD_VEC_THREADS_X * blockIdx.x;
    const uint by = ADD_VEC_THREADS_Y * blockIdx.y;

    for (uint x = bx; x < width; x += gridDim.x * ADD_VEC_THREADS_X) {
        __syncthreads();
        if (x + threadIdx.x < width && threadIdx.y == 0) {
            shVec[threadIdx.x] = vec[x + threadIdx.x];
        }
        __syncthreads();

        if (x + threadIdx.x < width) {
            for (uint y = by + threadIdx.y; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) {
                tgtMat[y * tgtStride + x + threadIdx.x] = op(mat[y * matStride + x + threadIdx.x], shVec[threadIdx.x]);
            }
        }
    }
}

/*
 * Matrix in ROW-MAJOR order!
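 *
 * (Editor's note: kRowVectorOp above broadcasts a length-`width` vector across
 * every row, caching the active slice of the vector in shared memory;
 * kColVectorOp below is the analogous broadcast of a length-`height` vector
 * across the columns. Host-side they back NVMatrix::applyBinaryV -- a usage
 * sketch assuming the API declared in nvmatrix.cuh:
 *
 *   NVMatrix acts(1024, 512);       // e.g. a batch of activations
 *   NVMatrix bias(1, 512);          // one entry per column of acts
 *   acts.addVector(bias);           // broadcast add, served by these kernels
 *   acts.eltwiseMultByVector(bias); // broadcast multiply
 * )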
 */
template <class Op>
__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat, const uint width, const uint height,
                             const uint matStride, const uint tgtStride, Op op) {
    __shared__ float shVec[ADD_VEC_THREADS_Y];
    const uint by = ADD_VEC_THREADS_Y * blockIdx.y;
    const uint bx = ADD_VEC_THREADS_X * blockIdx.x;
    const uint tidx = ADD_VEC_THREADS_X * threadIdx.y + threadIdx.x;

    mat += threadIdx.y * matStride;
    vec += tidx;
    tgtMat += threadIdx.y * tgtStride;

    for (uint y = by; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) {
        __syncthreads();
        if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) {
            shVec[tidx] = vec[y];
        }
        __syncthreads();

        if (y + threadIdx.y < height) {
            for (uint x = bx + threadIdx.x; x < width; x += gridDim.x * ADD_VEC_THREADS_X) {
                tgtMat[(y) * tgtStride + x] = op(mat[(y) * matStride + x], shVec[threadIdx.y]);
            }
        }
    }
}

/*
 * This one gets coalesced reads but computes only a partial sum which
 * must either be summed again (recursively) or summed on the host.
 */
template <class Agg, class UnaryOp, class BinaryOp, int blockSize>
__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
    const int idxX = blockIdx.x * blockSize*2 + threadIdx.x;

    __shared__ float accum[blockSize*2];

    matSum += blockIdx.y * sumWidth + blockIdx.x;
    /*
     * Here it's important to make sure that all threads in a block call __syncthreads,
     * so I have even the redundant threads (for which idxX >= width) enter this loop
     * just so that they may call __syncthreads at the appropriate times.
     */
    mat += width * blockIdx.y + idxX;

    accum[threadIdx.x] = agg.getBaseValue();
    accum[threadIdx.x + blockSize] = agg.getBaseValue();
    for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) {
        if (idxX < width) {
            accum[threadIdx.x] = uop(mat[0]);
            if(idxX + blockSize < width)
                accum[threadIdx.x + blockSize] = uop(mat[blockSize]);
        }
        if (blockSize >= 512) { __syncthreads(); if (threadIdx.x < 512) accum[threadIdx.x] = agg(accum[threadIdx.x], accum[threadIdx.x + 512]); }
        if (blockSize >= 256) { __syncthreads(); if (threadIdx.x < 256) accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 256]); }
        if (blockSize >= 128) { __syncthreads(); if (threadIdx.x < 128) accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 128]); }
        if (blockSize >= 64)  { __syncthreads(); if (threadIdx.x < 64)  accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 64]); }

        __syncthreads();
        volatile float* myAccum = &accum[threadIdx.x];
        if (threadIdx.x < 32) { // executed only by first warp
            myAccum[0] = agg(myAccum[0], myAccum[32]);
            myAccum[0] = agg(myAccum[0], myAccum[16]);
            myAccum[0] = agg(myAccum[0], myAccum[8]);
            myAccum[0] = agg(myAccum[0], myAccum[4]);
            myAccum[0] = agg(myAccum[0], myAccum[2]);
            myAccum[0] = agg(myAccum[0], myAccum[1]);
        }
        if (threadIdx.x == 0) {
            matSum[0] = bop(matSum[0], myAccum[0]);
            matSum += gridDim.y * sumWidth;
        }
        __syncthreads();
        mat += width * gridDim.y;
    }
}

template <class Agg, class BinaryOp>
__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) {
    const int tidx = threadIdx.x;

    __shared__ float accum[AWR_NUM_THREADS];
    volatile float* vMyAccum = &accum[tidx];
    float* myAccum = &accum[tidx];

    matSum += blockIdx.y;
    mat += width * blockIdx.y;

    for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) {
        myAccum[0] = agg.getBaseValue();
        for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
            myAccum[0] = agg(myAccum[0], mat[x]);
        }
        #pragma unroll
        for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
            const uint d = 1 << i;
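            // Editor's note: classic shared-memory tree reduction -- each pass
            // halves the number of active threads (stride d = 2^i) until a
            // single warp remains; the tail below then finishes inside one
            // warp through a volatile pointer, relying on (pre-Volta)
            // warp-synchronous execution instead of __syncthreads.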
            __syncthreads();
            if (tidx < d) {
                myAccum[0] = agg(myAccum[0], myAccum[d]);
            }
        }

        __syncthreads();
        if (tidx < WARP_SIZE) {
            #pragma unroll
            for (int i = LOG_WARP_SIZE; i >= 0; i--) {
                const uint d = 1 << i;
                vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
            }

            if (tidx == 0) {
                matSum[0] = op(matSum[0], vMyAccum[0]);
                matSum += gridDim.y;
            }
        }
        __syncthreads();
        mat += width * gridDim.y;
    }
}

/*
 * Implements multiscan idea from http://www.moderngpu.com
 * Not really useful for pure reductions but neat nonetheless.
 */
template <class Agg, class UnaryOp, class BinaryOp>
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
                                         Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = threadIdx.x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;

    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += blockIdx.y;
    mat += width * blockIdx.y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
    #pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;

    // Each warp does a reduction that doesn't require synchronization
    #pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();

    // The warps write their results
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
        #pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += gridDim.y;
        }
    }
}

/*
 * To be used when the rows are <= 64.
 *
 * TODO: try to reduce reg usage. i think this can be made faster too.
 */
//#define AGG_SHORT_ROWS_LOOPS_X 4
template <class Agg, class UnaryOp, class BinaryOp, int LOOPS_X, int THREADS_X>
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];

    const uint tidx = threadIdx.y * THREADS_X + threadIdx.x;
    const uint ty = LOOPS_X == 1 ? tidx / width : threadIdx.y; // when loops==1, width is gonna be smaller than block x dim
    const uint tx = LOOPS_X == 1 ? tidx % width : threadIdx.x;
    const uint bidx = blockIdx.y * gridDim.x + blockIdx.x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
    //    shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(threadIdx.y,shmemX) + threadIdx.x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;

    if (blockRowIdx < height) {
        #pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
            #pragma unroll
            for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) {
                //  __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
                #pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}

template <class Agg, class UnaryOp, class BinaryOp>
__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
    const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
    const uint tidx = threadIdx.y * AGG_SHORT_ROWS_THREADS_X + threadIdx.x;

    const uint bidx = blockIdx.y * gridDim.x + blockIdx.x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(threadIdx.y, shmemX) + threadIdx.x;
    matSum += blockRowIdx + tidx;
    //    shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0;
    mat += width * blockRowIdx + MUL24(threadIdx.y, width) + threadIdx.x;

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
    if(blockRowIdx < height) {
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = threadIdx.y + y + blockRowIdx < height;
            float accum = agg.getBaseValue();
            shmemWrite[0] = agg.getBaseValue();

            for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) {
                //  __syncthreads();
                if (heightIdxOK && x + threadIdx.x < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                #pragma unroll
                for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}

/*
 * Bad when there are few columns.
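 *
 * (Editor's note: each thread of this kernel owns one output column and walks
 * the whole column through the texture cache, so the available parallelism is
 * exactly the column count; with few columns most of the GPU sits idle. That
 * is why the kAggCols variant below computes `sumLength`-row partial sums
 * first -- multiplying the thread count by height/sumLength -- and a second
 * pass then reduces the partial sums.)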
 */
template <class Agg, class UnaryOp, class BinaryOp>
__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < width) {
        float mx = agg.getBaseValue();
        for (uint j = 0; j < height; j++) {
            mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
        }
        vec[idx] = bop(vec[idx], mx);
    }
}

/*
 * Better with few columns because it only computes a partial sum.
 */
template <class Agg, class UnaryOp>
__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) {
    const uint idxX = blockIdx.x * blockDim.x + threadIdx.x;
    const uint idxY = blockIdx.y * sumLength;
    if (idxX < width) {
        float mx = agg.getBaseValue();
        for (uint j = idxY; j < min(height,idxY + sumLength); j++) {
            mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
        }
        vec[blockIdx.y * width + idxX] = mx;
    }
}

template <class Agg>
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x;
    shmem[threadIdx.x] = agg.getBaseValue();
    if (eidx < gridDim.x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) {
            shmem[threadIdx.x] = agg(shmem[threadIdx.x], a[eidx]);
        }
    }
    __syncthreads();
    if (threadIdx.x < 256) {
        shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 256]);
    }
    __syncthreads();
    if (threadIdx.x < 128) {
        shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 128]);
    }
    __syncthreads();
    if (threadIdx.x < 64) {
        shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 64]);
    }
    __syncthreads();
    if (threadIdx.x < 32) {
        volatile float* mysh = &shmem[threadIdx.x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (threadIdx.x == 0) {
            target[blockIdx.x] = *mysh;
        }
    }
}

class AddGaussianUnaryRandomizer {
private:
    const float stdev;
public:
    AddGaussianUnaryRandomizer(float _stdev) : stdev(_stdev) { }
    __device__ inline float operator ()(float data, curandState* state) {
        return data + stdev * curand_normal(state);
    }
};

class BinarizeUnaryRandomizer {
public:
    __device__ inline float operator ()(float data, curandState* state) {
        return data > curand_uniform(state);
    }
};

class UniformUnaryRandomizer {
public:
    __device__ inline float operator ()(float data, curandState* state) {
        return curand_uniform(state);
    }
};

class GaussianUnaryRandomizer {
private:
    const float mean, stdev;
public:
    GaussianUnaryRandomizer(float _mean, float _stdev) : mean(_mean), stdev(_stdev) { }
    __device__ inline float operator ()(float data, curandState* state) {
        return mean + stdev * curand_normal(state);
    }
};

template <bool var>
class AddGaussianBinaryRandomizer {
public:
    __device__ inline float operator ()(float data, float stdev, curandState* state) {
        return data + (var ? stdev : 1) * stdev * curand_normal(state);
    }
};
class GaussianBinaryRandomizer {
private:
    const float mean;
public:
    GaussianBinaryRandomizer(float _mean) : mean(_mean) { }
    __device__ inline float operator ()(float data, float stdev, curandState* state) {
        return mean + stdev * curand_normal(state);
    }
};

class ScaledGaussianBinaryRandomizer {
private:
    const float mean, stdevScale;
public:
    ScaledGaussianBinaryRandomizer(float _mean, float _stdevScale) : mean(_mean), stdevScale(_stdevScale) { }
    __device__ inline float operator ()(float data, float stdev, curandState* state) {
        return mean + stdevScale * stdev * curand_normal(state);
    }
};

template <class Randomizer>
__global__ void kUnaryRandomize(float* data, float* targets, curandState* state, const uint numElements, Randomizer rnd) {
    const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
    curandState localState = state[tidx];

    for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) {
        targets[i] = rnd(data[i], &localState);
    }
    state[tidx] = localState;
}

template <class Randomizer>
__global__ void kBinaryRandomize(float* data, float* data2, float* targets, curandState* state, const uint numElements, Randomizer rnd) {
    const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
    curandState localState = state[tidx];

    for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) {
        targets[i] = rnd(data[i], data2[i], &localState);
    }
    state[tidx] = localState;
}

#endif /* NVMATRIX_KERNEL_H_ */

================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/include/nvmatrix_operators.cuh
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NVMATRIX_OPERATORS_CUH
#define NVMATRIX_OPERATORS_CUH

class NVMatrixOps {
public:
    class Exp { public: __device__ inline float operator()(const float a) const { return __expf(a); } };
    class Logistic { public: __device__ inline float operator()(const float a) const { return __fdividef(1.0f, 1.0f + __expf(-a)); } };
    class Log { public: __device__ inline float operator()(const float a) const { return __logf(a); } };
    class Square { public: __device__ inline float operator()(const float a) const { return a * a; } };
    class Sqrt { public: __device__ inline float operator()(const float a) const { return sqrtf(a); } };
    class SqrtAbs { public: __device__ inline float operator()(const float a) const { return sqrtf(fabsf(a)); } };
    class Reciprocal { public: __device__ inline float operator()(const float a) const { return 1.0f / a; } };
    class Abs { public: __device__ inline float operator()(const float a) const { return a > 0 ? a : -a; } };
    class Sign { public: __device__ inline float operator()(const float a) const { return (a > 0) - (a < 0); } };
    class Identity { public: __device__ inline float operator()(const float a) const { return a; } };
    class Zero { public: __device__ inline float operator()(const float a) const { return 0; } };
    class One { public: __device__ inline float operator()(const float a) const { return 1; } };
    class Const { private: const float scalar; public: Const(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return scalar; } };
    class OneMinus { public: __device__ inline float operator()(const float x) const { return 1.0f - x; } };
    class Linear { protected: float _a, _b; public: __device__ inline float operator()(float x) const { return _a * x + _b; } Linear(float a, float b) : _a(a), _b(b) { } };
    class IsNan { public: __device__ inline float operator()(const float a) const { return isnan(a); } };
    class IsInf { public: __device__ inline float operator()(const float a) const { return isinf(a); } };
    class SmallerThanScalar { private: const float scalar; public: SmallerThanScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a < scalar; } };
    class BiggerThanScalar { private: const float scalar; public: BiggerThanScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a > scalar; } };
    class AddScalar { private: const float scalar; public: AddScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a + scalar; } };
    class WeightedAddScalar { private: const float weight, scalar; public: WeightedAddScalar(const float _weight, const float _scalar) : weight(_weight), scalar(_scalar) { } __device__ inline float operator()(const float a) const { return weight * a + scalar; } };
    class MultByScalar { private: const float scalar; public: MultByScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a * scalar; } };
    class Pow { private: const float p; public: Pow(const float _p) : p(_p) { } __device__ inline float operator()(const float a) const { return __powf(a, p); } };
    template <bool exclusive>
    class InRange { private: const float lower, upper; public: InRange(const float _lower, const float _upper) : lower(_lower), upper(_upper) { } __device__ inline float operator()(const float a) const { return exclusive ? a > lower && a < upper : a >= lower && a <= upper; } };
    class MinWithScalar { private: const float scalar; public: MinWithScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a > scalar ? scalar : a; } };
    class MaxWithScalar { private: const float scalar; public: MaxWithScalar(const float _scalar) : scalar(_scalar) { } __device__ inline float operator()(const float a) const { return a > scalar ? a : scalar; } };
};

class NVMatrixBinaryOps {
public:
    class BinaryOp { public: };
    class Equals : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a == b; } };
    class BiggerThan : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a > b; } };
    class Divide : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return __fdividef(a, b); } };
    class DivideAccurate : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a / b; } };
    class DivideSafe : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return b == 0 ? 0 : __fdividef(a, b); } };
    class DivideSafeAccurate : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return b == 0 ? 0 : (a / b); } };
    class Multiply : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a * b; } };
    class SquaredDiff : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return (a - b) * (a - b); } };
    class WeightedAdd : public BinaryOp {
    private:
        const float scaleA, scaleB;
    public:
        WeightedAdd(const float _scaleA, const float _scaleB) : scaleA(_scaleA), scaleB(_scaleB) { }
        WeightedAdd() : scaleA(0), scaleB(0) { } // Compiler complains about no default constructor?
        __device__ inline float operator()(const float a, const float b) const { return a * scaleA + b * scaleB; }
    };
    class WeightedAdd1 : public BinaryOp { private: const float scaleB; public: WeightedAdd1(const float _scaleB) : scaleB(_scaleB) { } __device__ inline float operator()(const float a, const float b) const { return a + b * scaleB; } };
    class ScaledAdd : public BinaryOp { private: const float scaleB; public: ScaledAdd(const float _scaleB) : scaleB(_scaleB) { } __device__ inline float operator()(const float a, const float b) const { return a + b * scaleB; } };
    class Add : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a + b; } };
    class First : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return a; } };
    class Second : public BinaryOp { public: __device__ inline float operator()(const float a, const float b) const { return b; } };
    class SecondScaled : public BinaryOp {
    private:
        const float scale;
    public:
        SecondScaled(const float _scale) : scale(_scale) { }
        SecondScaled() : scale(0) { } // Compiler complains about no default constructor?
        __device__ inline float operator()(const float a, const float b) const { return scale * b; }
    };
    template <class UnaryOp, class BinaryOp>
    class CompositeSecond : public BinaryOp {
    private:
        UnaryOp _uop;
        BinaryOp _bop;
    public:
        CompositeSecond(UnaryOp uop, BinaryOp bop) : _uop(uop), _bop(bop) { }
        __device__ inline float operator()(const float a, const float b) const { return _bop(a, _uop(b)); }
    };
};

class NVMatrixAggs {
public:
    class Sum { public: __device__ inline float operator()(const float a, const float b) const { return a + b; } __device__ inline float getBaseValue() { return 0; } };
    class Max { public: __device__ inline float operator()(const float a, const float b) const { return a > b ? a : b; } __device__ inline float getBaseValue() { return -2e38; } };
    class Min { public: __device__ inline float operator()(const float a, const float b) const { return a > b ? b : a; } __device__ inline float getBaseValue() { return 2e38; } };
    class CountNan { public: __device__ inline float operator()(const float a, const float b) const { return a + isnan(b); } __device__ inline float getBaseValue() { return 0; } };
    class CountInf { public: __device__ inline float operator()(const float a, const float b) const { return a + isinf(b); } __device__ inline float getBaseValue() { return 0; } };
    template <class UnaryOperator>
    class ArgMax {
    private:
        UnaryOperator u;
    public:
        ArgMax(UnaryOperator _u) : u(_u) { }
        __device__ inline float operator()(const float a, const float b) const { return u(a) > u(b) ? a : b; }
        __device__ inline float getBaseValue() { return u.getArgMin(); }
    };
};

class NVMatrixTernaryOps {
public:
    class Add { public: __device__ inline float operator()(const float a, const float b, const float c) const { return a + b + c; } };
    class WeightedAdd {
    private:
        const float scaleA, scaleB, scaleC;
    public:
        WeightedAdd(const float _scaleA, const float _scaleB, const float _scaleC) : scaleA(_scaleA), scaleB(_scaleB), scaleC(_scaleC) { }
        __device__ inline float operator()(const float a, const float b, const float c) const { return a * scaleA + b * scaleB + c * scaleC; }
    };
};

#endif /* NVMATRIX_OPERATORS_CUH */

================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/src/memory.cu
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../include/memory.cuh"

Lock MemoryManager::_globalLock;
std::map<int,MemoryManager*> FastMemoryManager::_memoryManagers;

MemoryManager& FastMemoryManager::getInstance(int deviceID) {
    _globalLock.acquire();
    if (_memoryManagers.count(deviceID) == 0) {
        _memoryManagers[deviceID] = (new FastMemoryManager(deviceID))->init();
    }
    MemoryManager& ret = *_memoryManagers[deviceID];
    _globalLock.release();
    return ret;
}

MemoryManager* CUDAMemoryManager::_memoryManager = NULL;
MemoryManager& CUDAMemoryManager::getInstance(int deviceID) {
    _globalLock.acquire();
    if (_memoryManager == NULL) {
        _memoryManager = new CUDAMemoryManager();
    }
    _globalLock.release();
    return *_memoryManager;
}

MemoryManager* CUDAHostMemoryManager::_memoryManager = NULL;
MemoryManager& CUDAHostMemoryManager::getInstance() {
    _globalLock.acquire();
    if (_memoryManager == NULL) {
        _memoryManager = new CUDAHostMemoryManager();
    }
    _globalLock.release();
    return *_memoryManager;
}

MemoryManager* FastHostMemoryManager::_memoryManager = NULL;
MemoryManager& FastHostMemoryManager::getInstance() {
    _globalLock.acquire();
    if (_memoryManager == NULL) {
        _memoryManager = (new FastHostMemoryManager())->init();
    }
    _globalLock.release();
    return *_memoryManager;
}

void FastMemoryManager::destroyInstance(int deviceID) {
    _globalLock.acquire();
    if (_memoryManagers.count(deviceID) != 0) {
        delete _memoryManagers[deviceID];
        _memoryManagers.erase(deviceID);
    }
    _globalLock.release();
}

void FastHostMemoryManager::destroyInstance() {
    _globalLock.acquire();
    if (_memoryManager != NULL) {
        delete _memoryManager;
        _memoryManager = NULL;
    }
    _globalLock.release();
}

void CUDAMemoryManager::destroyInstance(int deviceID) {
    _globalLock.acquire();
    if (_memoryManager != NULL) {
        delete _memoryManager;
        _memoryManager = NULL;
    }
    _globalLock.release();
}

void CUDAHostMemoryManager::destroyInstance() {
    _globalLock.acquire();
    if (_memoryManager != NULL) {
        delete _memoryManager;
        _memoryManager = NULL;
    }
    _globalLock.release();
}

================================================
FILE: caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix.cu
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "../include/nvmatrix.cuh"
#include "../include/nvmatrix_operators.cuh"

using namespace std;

/*
 * Device random number generator pointers.
 */
//map NVMatrix::rndGen;
map<int,MemorySegment*> NVMatrix::_rndDevStates;
map<int,int> NVMatrix::_rndDevThreads;
pthread_mutex_t* NVMatrix::_rndMutex = makeMutex();
pthread_mutex_t* NVMatrix::_cublasMutex = makeMutex();
pthread_mutex_t* NVMatrix::_streamMutex = makeMutex();
std::map<int,cublasHandle_t> NVMatrix::_cublasHandles;
std::map<int,cudaStream_t> NVMatrix::_defaultStreams;

pthread_mutex_t* NVMatrix::makeMutex() {
    pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t));
    pthread_mutex_init(m, NULL);
    return m;
}

/*
 * Do not call resize in _init because resize is a virtual function which is
 * overridden in a subclass.
 * C++ cannot dispatch to an overriding function from within a constructor, so
 * we call resize separately from every constructor after calling _init.
 */
void NVMatrix::_init(bool isTrans) {
    _numRows = 0;
    _numCols = 0;
    _numElements = 0;
    _ownsData = true;
    _isTrans = isTrans;
    _memSegment = NULL;
    _stride = 0;
    _texObj = 0;
}

NVMatrix::NVMatrix() : _deleted(false) {
    _init(false);
}

NVMatrix::NVMatrix(bool isTrans) : _deleted(false) {
    _init(isTrans);
}

NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) : _deleted(false) {
    _init(isTrans);
    resize(numRows, numCols);
}

NVMatrix::NVMatrix(const Matrix& like, bool copy) : _deleted(false) {
    _init(like.isTrans());
    resize(like.getNumRows(), like.getNumCols());
    if (copy) {
        copyFromHost(like);
    }
}

NVMatrix::NVMatrix(const NVMatrix& like, bool copy) : _deleted(false) {
    _init(like.isTrans());
    resize(like.getNumRows(), like.getNumCols());
    if (copy) {
        like.copy(*this);
    }
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
 */
NVMatrix::NVMatrix(const NVMatrix& like) : _deleted(false) {
    _init(like.isTrans());
    resize(like.getNumRows(), like.getNumCols());
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
 */
NVMatrix::NVMatrix(const Matrix& like) : _deleted(false) {
    _init(false);
    resize(like.getNumRows(), like.getNumCols());
}

NVMatrix::NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) :
    _numRows(numRows), _numCols(numCols), _numElements(numRows*numCols), _ownsData(false),
    _memSegment(mem), _isTrans(isTrans), _deleted(false), _texObj(0) {
    _stride = stride < 0 ? getLeadingDim() : stride;
}

NVMatrix::~NVMatrix() {
    if (!_deleted) {
        deallocTexture();
        if(_ownsData && _numElements > 0) {
            dealloc();
        } else {
            // dealloc deletes the mem segment. But if this is a view,
            // then we still need to delete the mem segment object.
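            /*
             * Editor's note: _ownsData separates real allocations from views
             * (slice(), getTranspose(), and reshaped() construct with
             * _ownsData == false). A view shares the underlying device
             * allocation -- so dealloc() must not run -- but it does own its
             * MemorySegment wrapper object, which is why only the wrapper is
             * deleted on this branch.
             */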
// assert(_memSegment == NULL || _memSegment->getSize() == 0); delete _memSegment; } } } void NVMatrix::copyFromHost(const Matrix& hostMatrix) { copyFromHost(hostMatrix, false, getDefaultStream()); } void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { copyFromHost(hostMatrix, resizeTarget, getDefaultStream()); } void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) { if (resizeTarget) { resize(hostMatrix); } else { assert(isSameDims(hostMatrix)); } setTrans(hostMatrix.isTrans()); if (getNumElements() > 0) { CUBLAS_CALL(cublasSetMatrixAsync(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float), hostMatrix.getData(), hostMatrix.getLeadingDim(), getDevData(), _stride, stream)); syncStream(stream); } } void NVMatrix::copyToHost(Matrix& hostMatrix) const { copyToHost(hostMatrix, false, getDefaultStream()); } void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { copyToHost(hostMatrix, resizeTarget, getDefaultStream()); } void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { if (resizeTarget) { hostMatrix.resize(_numRows, _numCols); } else { assert(isSameDims(hostMatrix)); } hostMatrix.setTrans(_isTrans); if (getNumElements() > 0) { CUBLAS_CALL(cublasGetMatrixAsync(getLeadingDim(),getFollowingDim(), sizeof(float), getDevData(), getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim(), stream)); syncStream(stream); } } void NVMatrix::copy(NVMatrix& dest) const { copy(dest, getDefaultStream()); } void NVMatrix::copy(NVMatrix& dest, cudaStream_t stream) const { if (&dest != this) { if (!isSameDims(dest)) { dest.resize(*this); } copy(dest, 0, -1, 0, -1, 0, 0, stream); } } NVMatrix& NVMatrix::copy() const { NVMatrix& c = construct(); copy(c); return c; } void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target) { rightMult(b, scaleAB, target, getDefaultStream()); } void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream) { // if(&target != this && &target != &b) { // target.resize(_numRows, b.getNumCols()); // target.setTrans(true); // } target.addProduct(*this, b, 0, scaleAB, stream); } void NVMatrix::rightMult(NVMatrix &b, float scaleAB) { rightMult(b, scaleAB, *this); } void NVMatrix::rightMult(NVMatrix &b, NVMatrix& target) { rightMult(b, 1, target); } void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB) { addProduct(a, b, scaleThis, scaleAB, getDefaultStream()); } /* * This will only work if this matrix is in column-major order! In other words, * if isTrans() returns true. 
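 *
 * (Editor's note: cuBLAS assumes column-major storage, which is what
 * isTrans() == true means for the target here; the operands' own layouts are
 * instead folded into the op flags returned by getTransChar(). A minimal
 * standalone cuBLAS sketch of the same call shape, assuming column-major
 * C = alpha*A*B + beta*C with A m-by-k and B k-by-n:
 *
 *   float alpha = 1.f, beta = 0.f;
 *   cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
 *                  &alpha, dA, m, dB, k, &beta, dC, m);
 * )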
*/ void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream) { assert(a.getNumCols() == b.getNumRows()); if (scaleThis == 0) { resize(a.getNumRows(), b.getNumCols()); setTrans(true); } assert(this->getNumRows() == a.getNumRows()); assert(this->getNumCols() == b.getNumCols()); assert(_isTrans); CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); CUBLAS_CALL(cublasSgemm_v2(getCublasHandle(), a.getTransChar(), b.getTransChar(), a.getNumRows(), b.getNumCols(), a.getNumCols(), &scaleAB, a.getDevData(), a.getStride(), b.getDevData(), b.getStride(), &scaleThis, getDevData(), getStride())); } void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b) { addProduct(a, b, 1, 1); } void NVMatrix::assertSame(NVMatrixV& a) { for (int i = 1; i < a.size(); ++i) { assert(a[i]->isSameDims(*a[0])); assert(a[i]->isTrans() == a[0]->isTrans()); assert(a[i]->getStride() == a[0]->getStride()); assert(a[i]->getDataDeviceID() == a[0]->getDataDeviceID()); } } void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream(), aPtrsDev, bPtrsDev, tgtPtrsDev); } void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB) { batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream()); } void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) { assert(a.size() == b.size()); assert(a.size() == target.size()); assertSame(a); assertSame(b); assertSame(target); const int batch = a.size(); if (batch > 0) { const int rows = a[0]->getNumRows(), inner = a[0]->getNumCols(), cols = b[0]->getNumCols(); assert(inner == b[0]->getNumRows()); assert(target[0]->getNumRows() == rows); assert(target[0]->getNumCols() == cols); const int lda = a[0]->getStride(), ldb = b[0]->getStride(), ldc = target[0]->getStride(); cublasOperation_t atrans = a[0]->getTransChar(), btrans = b[0]->getTransChar(); CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream)); CUBLAS_CALL(cublasSgemmBatched(getCublasHandle(), atrans, btrans, rows, cols, inner, &scaleAB, aPtrsDev, lda, bPtrsDev, ldb, &scaleTarget, tgtPtrsDev, ldc, batch)); } } void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream) { assert(a.size() == b.size()); assert(a.size() == target.size() || target.size() == 0); const int batch = a.size(); if (batch > 0) { const int rows = a[0]->getNumRows(), cols = b[0]->getNumCols(); const float* aPtrs[batch], *bPtrs[batch], *tgtPtrs[batch]; for (int i = 0; i < batch; ++i) { if (target.size() <= i) { target.push_back(new NVMatrix(rows, cols, true)); } aPtrs[i] = a[i]->getDevData(); bPtrs[i] = b[i]->getDevData(); tgtPtrs[i] = target[i]->getDevData(); } // const float** aPtrsDev, **bPtrsDev; // float **tgtPtrsDev; // checkCudaErrors(cudaMalloc(&aPtrsDev, batch * sizeof(float*))); // checkCudaErrors(cudaMalloc(&bPtrsDev, batch * sizeof(float*))); // checkCudaErrors(cudaMalloc(&tgtPtrsDev, batch * sizeof(float*))); MemorySegment* aPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*)); MemorySegment* bPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * 
sizeof(float*));
        MemorySegment* tgtPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*));

        checkCudaErrors(cudaMemcpyAsync(aPtrsDev, aPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyAsync(bPtrsDev, bPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyAsync(tgtPtrsDev, tgtPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));

        batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, stream,
                              const_cast<const float**>(aPtrsDev->getData<float*>()),
                              const_cast<const float**>(bPtrsDev->getData<float*>()),
                              tgtPtrsDev->getData<float*>());

        // checkCudaErrors(cudaFree(aPtrsDev));
        // checkCudaErrors(cudaFree(bPtrsDev));
        // checkCudaErrors(cudaFree(tgtPtrsDev));
        DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(aPtrsDev);
        DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(bPtrsDev);
        DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(tgtPtrsDev);
    }
}

template <class Randomizer>
void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) {
    _unaryRandomize(target, rnd, getDefaultStream());
}

template <class Randomizer>
void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream) {
    assert(isRndInitialized());
    assert(isContiguous() && target.isContiguous());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kUnaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK,0,stream>>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    getLastCudaError("kUnaryRandomize: Kernel execution failed");
}

template <class Randomizer>
void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) {
    _binaryRandomize(data2, target, rnd, getDefaultStream());
}

template <class Randomizer>
void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream) {
    assert(isRndInitialized());
    assert(isContiguous() && data2.isContiguous() && target.isContiguous());
    assert(isSameDims(data2));
    assert(isTrans() == data2.isTrans());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kBinaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK,0,stream>>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    getLastCudaError("kBinaryRandomize: Kernel execution failed");
}

void NVMatrix::initRandom(unsigned long long seed, int numStreams) {
    NVMatrix::initRandom(seed, numStreams, NVMatrix::getDefaultStream());
}

void NVMatrix::initRandom(unsigned long long seed, int numStreams, cudaStream_t stream) {
//    printf("init random on device %d\n", getDeviceID());
    pthread_mutex_lock(_rndMutex);
    assert(!isRndInitialized(true));
    int d = getDeviceID();
//    _rndDevStates[d] = NULL;
    _rndDevThreads[d] = numStreams;
    _rndDevStates[d] = DEVICE_MEMORY_MANAGER::getInstance(d).malloc(numStreams * sizeof(curandState));
//    checkCudaErrors(cudaMalloc((void **)&_rndDevStates[d], numStreams * sizeof(curandState)));
    pthread_mutex_unlock(_rndMutex);
    kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK, 0, stream>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
    getLastCudaError("kSetupCurand: Kernel execution failed");
}

void NVMatrix::initRandom(unsigned long long seed) {
    initRandom(seed, NUM_RND_STREAMS);
}

void NVMatrix::initRandom() {
    NVMatrix::initRandom(time(0));
}

void NVMatrix::initCublas() {
    int d = getDeviceID();
    pthread_mutex_lock(_cublasMutex);
    assert(_cublasHandles.count(d) == 0);
    CUBLAS_CALL(cublasCreate(&_cublasHandles[d]));
    // It appears that cublasCreate causes a host -> device copy on stream 0,
    // so we synchronize with it because we run everything else on other
    // streams.
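    // Editor's note: one handle is cached per device in _cublasHandles; each
    // GEMM then binds the cached handle to the caller's stream via
    // cublasSetStream_v2 (see addProduct and batchedMatrixMultiply above)
    // rather than creating a handle per stream.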
    syncDevice();
    pthread_mutex_unlock(_cublasMutex);
}

void NVMatrix::destroyCublas() {
    int d = getDeviceID();
    pthread_mutex_lock(_cublasMutex);
    assert(_cublasHandles.count(d) > 0);
    CUBLAS_CALL(cublasDestroy(_cublasHandles[d]));
    _cublasHandles.erase(d);
    pthread_mutex_unlock(_cublasMutex);
}

cublasHandle_t NVMatrix::getCublasHandle() {
    return getCublasHandle(getDeviceID());
}

cublasHandle_t NVMatrix::getCublasHandle(int deviceID) {
    pthread_mutex_lock(_cublasMutex);
    assert(_cublasHandles.count(deviceID) > 0);
    cublasHandle_t h = _cublasHandles[deviceID];
    pthread_mutex_unlock(_cublasMutex);
    return h;
}

cudaStream_t NVMatrix::getDefaultStream() {
    return getDefaultStream(NVMatrix::getDeviceID());
}

cudaStream_t NVMatrix::getDefaultStream(int deviceID) {
    if (deviceID >= 0) {
        pthread_mutex_lock(_streamMutex);
        if (_defaultStreams.count(deviceID) == 0) {
            int oldDeviceID = getDeviceID();
            NVMatrix::setDeviceID(deviceID);
            checkCudaErrors(cudaStreamCreateWithFlags(&_defaultStreams[deviceID], cudaStreamNonBlocking));
            NVMatrix::setDeviceID(oldDeviceID);
        }
        cudaStream_t s = _defaultStreams[deviceID];
        pthread_mutex_unlock(_streamMutex);
        return s;
    }
    return 0;
}

void NVMatrix::syncDevice() {
    checkCudaErrors(cudaDeviceSynchronize());
}

void NVMatrix::syncStream(cudaStream_t stream) {
    checkCudaErrors(cudaStreamSynchronize(stream));
}

void NVMatrix::syncStream() {
    syncStream(getDefaultStream());
}

curandState* NVMatrix::getCurandState() {
    /*
     * Even though we're only reading from the map here, it's important to grab
     * the mutex because another thread may be writing to it.
     */
    pthread_mutex_lock(_rndMutex);
    int d = getDeviceID();
    assert(isRndInitialized(true));
    curandState* r = _rndDevStates[d]->getData<curandState>();
    pthread_mutex_unlock(_rndMutex);
    return r;
}

curandState* NVMatrix::getCurandState(int numStreams) {
    int d = getDeviceID();
    pthread_mutex_lock(_rndMutex);
    assert(isRndInitialized(true));
    bool realloc = numStreams > _rndDevThreads[d];
    pthread_mutex_unlock(_rndMutex);

    if (realloc) {
        destroyRandom();
        initRandom(time(0), numStreams);
    }
    return getCurandState();
}

int NVMatrix::getDataDeviceID() const {
    if (getDevData() == NULL) {
        return DEVICE_NULL;
    }
    struct cudaPointerAttributes atts;
    checkCudaErrors(cudaPointerGetAttributes(&atts, getDevData()));
    return atts.memoryType == cudaMemoryTypeDevice ?
atts.device : DEVICE_HOST; } int NVMatrix::getDeviceID() { int d; checkCudaErrors(cudaGetDevice(&d)); // if (d == 0) { // raise(SIGABRT); // } return d; } void NVMatrix::setDeviceID(int d) { assert(d >= 0); // printf("Setting device to %d\n", d); // if (d == 0) { // raise(SIGABRT); // } checkCudaErrors(cudaSetDevice(d)); } bool NVMatrix::canAccessPeer(int srcDevice, int tgtDevice) { if (srcDevice == tgtDevice) { return true; } int canAccess; checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice)); return canAccess; } bool NVMatrix::isRndInitialized(bool haveLock) { if (!haveLock) { pthread_mutex_lock(_rndMutex); } bool b = _rndDevStates.count(getDeviceID()) != 0; if (!haveLock) { pthread_mutex_unlock(_rndMutex); } return b; } bool NVMatrix::isRndInitialized() { return isRndInitialized(false); } void NVMatrix::destroyRandom() { int d = getDeviceID(); pthread_mutex_lock(_rndMutex); assert(isRndInitialized(true)); // checkCudaErrors(cudaFree(_rndDevStates[d])); DEVICE_MEMORY_MANAGER::getInstance(d).free(_rndDevStates[d]); _rndDevStates.erase(d); _rndDevThreads.erase(d); pthread_mutex_unlock(_rndMutex); } void NVMatrix::binarizeProbs() { binarizeProbs(*this); } void NVMatrix::binarizeProbs(NVMatrix& target) { _unaryRandomize(target, BinarizeUnaryRandomizer()); } void NVMatrix::randomizeUniform() { assert(isContiguous()); assert(isRndInitialized()); // CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements())); _unaryRandomize(*this, UniformUnaryRandomizer()); } void NVMatrix::randomizeGaussian() { randomizeGaussian(1); } void NVMatrix::randomizeGaussian(float stdev) { randomizeGaussian(0, stdev); } void NVMatrix::randomizeGaussian(float mean, float stdev) { assert(isContiguous()); assert(isRndInitialized()); // CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev)); _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev)); } /* * Kind of a hack since we don't actually need the contents of this matrix for it, * so we don't really need a binary randomizer. 
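 *
 * (Editor's note, a usage sketch assuming the API in this file: the
 * randomizeX and addGaussianNoise methods all funnel into _unaryRandomize or
 * _binaryRandomize, which launch a fixed grid of NUM_RND_STREAMS curand
 * generators; each GPU thread pulls its curandState into a register, strides
 * over the elements, and writes the state back. E.g., assuming initRandom may
 * be invoked as the static-style call shown:
 *
 *   NVMatrix m(512, 512);
 *   NVMatrix::initRandom(42);        // seed the per-device curand states
 *   m.randomizeGaussian(0.f, 1.f);   // fill m with N(0,1) samples
 * )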
 */
void NVMatrix::randomizeGaussian(NVMatrix& stdevs) {
    randomizeGaussian(0, stdevs);
}

void NVMatrix::randomizeGaussian(float mean, NVMatrix& stdevs) {
    _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer(mean));
}

void NVMatrix::randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs) {
    _binaryRandomize(stdevs, *this, ScaledGaussianBinaryRandomizer(mean, stdevMult));
}

void NVMatrix::addGaussianNoise() {
    addGaussianNoise(1);
}

void NVMatrix::addGaussianNoise(float stdev) {
    addGaussianNoise(stdev, *this);
}

void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) {
    _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev));
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) {
    addGaussianNoise(stdevs, var, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs) {
    addGaussianNoise(stdevs, false, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) {
    if (var) {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<true>());
    } else {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<false>());
    }
}

void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target);
}

void NVMatrix::biggerThan(NVMatrix& b) {
    biggerThan(b, *this);
}

void NVMatrix::equals(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Equals(), b, target);
}

void NVMatrix::equals(NVMatrix& m) {
    equals(m, *this);
}

void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target);
}

void NVMatrix::biggerThanVector(NVMatrix& vec) {
    biggerThanVector(vec, *this);
}

void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const {
    assert(startRow >= 0 && startRow <= _numRows);
    assert(endRow >= startRow && endRow <= _numRows);
    assert(startCol >= 0 && startCol <= _numCols);
    assert(endCol >= startCol && endCol <= _numCols);
}

/*
 * The only place where stride is supported for now!
 * Will ALWAYS return a view of the original data, sometimes non-contiguous.
 */
NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    if (!isTrans()) {
        return construct(new MemorySegment(this->getDevData() + startRow * _stride + startCol), endRow - startRow, endCol - startCol, _stride, false);
    }
    return construct(new MemorySegment(this->getDevData() + startCol * _stride + startRow), endRow - startRow, endCol - startCol, _stride, true);
}

/* this will NEVER return a view */
void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ?
this->_numCols : endCol; _checkBounds(startRow, endRow, startCol, endCol); int sliceRows = endRow - startRow, sliceCols = endCol - startCol; if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) { target.resize(sliceRows, sliceCols); } this->copy(target, startRow, endRow, startCol, endCol, 0, 0); } NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const { return slice(startRow, endRow, 0, -1); } void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const { slice(startRow, endRow, 0, -1, target); } NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const { return slice(0, -1, startCol, endCol); } void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const { slice(0, -1, startCol, endCol, target); } NVMatrixV& NVMatrix::splitRows(int numParts) { assert(getNumRows() % numParts == 0); NVMatrixV& v = *new NVMatrixV(); int partSize = getNumRows() / numParts; for (int p = 0; p < numParts; ++p) { v.push_back(&sliceRows(p * partSize, (p+1) * partSize)); } return v; } NVMatrixV& NVMatrix::splitCols(int numParts) { assert(getNumCols() % numParts == 0); NVMatrixV& v = *new NVMatrixV(); int partSize = getNumCols() / numParts; for (int p = 0; p < numParts; ++p) { v.push_back(&sliceCols(p * partSize, (p+1) * partSize)); } return v; } /* * Guaranteed to not change the data if the number of elements doesn't change. * So you can use this to "reshape" a matrix. */ bool NVMatrix::resize(int numRows, int numCols, bool trans) { setTrans(trans); bool reallocated = false; if (numRows != _numRows || numCols != _numCols) { assert(_ownsData || (_numElements == numRows * numCols && isContiguous())); if (_numElements != numRows * numCols) { if (_numElements > 0) { // free old memory dealloc(); } if (numRows * numCols > 0) { // allocate new memory alloc(numCols * numRows); } else { _memSegment = NULL; } reallocated = true; } _numRows = numRows; _numCols = numCols; _numElements = numRows * numCols; _stride = getLeadingDim(); } return reallocated; } bool NVMatrix::resize(int numRows, int numCols) { return resize(numRows, numCols, isTrans()); } bool NVMatrix::resize(const NVMatrix& like) { setTrans(like.isTrans()); return resize(like.getNumRows(), like.getNumCols()); } bool NVMatrix::resize(const Matrix& like) { setTrans(like.isTrans()); return resize(like.getNumRows(), like.getNumCols()); } void NVMatrix::reshape(int numRows, int numCols) { assert(isContiguous()); assert(_numElements == numRows*numCols); _numRows = numRows; _numCols = numCols; _stride = getLeadingDim(); } NVMatrix& NVMatrix::reshaped(int numRows, int numCols) const { assert(isContiguous()); assert(_numElements == numRows*numCols); return construct(new MemorySegment(*_memSegment), numRows, numCols, -1, _isTrans); } void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol) const { copy(dest, srcStartRow, srcEndRow, srcStartCol, srcEndCol, destStartRow, destStartCol, getDefaultStream()); } void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol, cudaStream_t stream) const { srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow; srcEndCol = srcEndCol < 0 ? 
_numCols : srcEndCol; NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol); NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol); if (srcSlice->isContiguous() && destSlice->isContiguous() && srcSlice->isSameDims(*destSlice) && srcSlice->isTrans() == destSlice->isTrans()) { // The commonest case. checkCudaErrors(cudaMemcpyAsync(destSlice->getDevData(), srcSlice->getDevData(), srcSlice->getNumDataBytes(), cudaMemcpyDefault, stream)); } else { srcSlice->apply(NVMatrixOps::Identity(), *destSlice, stream); } delete srcSlice; delete destSlice; } NVMatrix& NVMatrix::getTranspose() { return construct(new MemorySegment(*_memSegment), _numCols, _numRows, _stride, !_isTrans); } NVMatrix& NVMatrix::getClone() { return construct(new MemorySegment(*_memSegment), _numRows, _numCols, _stride, _isTrans); } void NVMatrix::transpose(NVMatrix& target) { flipTrans(target); target.setTrans(!target.isTrans()); target.reshape(target.getNumCols(), target.getNumRows()); } void NVMatrix::transpose() { int tmp = _numCols; _numCols = _numRows; _numRows = tmp; _isTrans = !_isTrans; } bool NVMatrix::transpose(bool trans) { bool oldTrans = _isTrans; if (oldTrans != trans) { transpose(); } return oldTrans; } /* * Flips the ordering of the matrix from row-major to column-major and vice versa. * This creates temporary storage -- not a cheap operation. * * This is not equivalent to a "hard transpose". The resultant matrix still has * the same dimensions, its layout in memory just changes. */ NVMatrix& NVMatrix::flipTrans() { NVMatrix& meTrans = construct(*this); flipTrans(meTrans); return meTrans; } void NVMatrix::flipTrans(NVMatrix& target) { flipTrans(target, getDefaultStream()); } void NVMatrix::flipTrans(NVMatrix& target, cudaStream_t stream) { assert(&target != this); target.resize(_numRows, _numCols); target.setTrans(!isTrans()); // target.printShape("target"); // this->printShape("this"); apply(NVMatrixOps::Identity(), target, stream); } void NVMatrix::squaredDiff(NVMatrix& b) { squaredDiff(b, *this); } void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) { applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target); } void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) { add(b, scaleA, scaleB, target, NVMatrix::getDefaultStream()); } void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream) { if (scaleA == 0) { b.scale(scaleB, target, stream); } else if (scaleB == 0) { scale(scaleA, target, stream); } else if (scaleA == 1 && scaleB == 1) { // slight optimization applyBinary(NVMatrixBinaryOps::Add(), b, target, stream); } else if (scaleA == 1) { applyBinary(NVMatrixBinaryOps::WeightedAdd1(scaleB), b, target, stream); } else { applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target, stream); } } void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) { add(b, 1, scaleB, target); } void NVMatrix::add(NVMatrix& b, NVMatrix& target) { add(b, 1, target); } void NVMatrix::add(NVMatrix& b, float scaleB) { add(b, scaleB, *this); } void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) { add(b, scaleA, scaleB, *this); } void NVMatrix::add(NVMatrix& b) { add(b, 1, *this); } void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) { add(b, -1, target); } void NVMatrix::subtract(NVMatrix& b) { add(b, -1); } void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) { applyBinary(NVMatrixBinaryOps::Multiply(), b, target); } void 
NVMatrix::eltwiseMult(NVMatrix& b) { eltwiseMult(b, *this); } void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) { applyBinary(NVMatrixBinaryOps::Divide(), b, target); } void NVMatrix::eltwiseDivide(NVMatrix& b) { eltwiseDivide(b, *this); } void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) { tile(timesY, timesX, target, getDefaultStream()); }
void NVMatrix::tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream) { assert(isContiguous() && target.isContiguous()); assert(timesX > 0 && timesY > 0); target.resize(_numRows*timesY, _numCols*timesX); target.setTrans(_isTrans); if(!isTrans()) { kTile<<<NUM_TILE_BLOCKS, NUM_TILE_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), target.getDevData(), _numCols, _numRows, target._numCols, target._numRows); } else { kTile<<<NUM_TILE_BLOCKS, NUM_TILE_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), target.getDevData(), _numRows, _numCols, target._numRows, target._numCols); } getLastCudaError("Kernel execution failed"); }
void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) { addVector(vec, scaleVec, target, getDefaultStream()); } void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream) { applyBinaryV(NVMatrixBinaryOps::ScaledAdd(scaleVec), vec, target, stream); } void NVMatrix::addVector(NVMatrix& vec) { addVector(vec, 1); } void NVMatrix::addVector(NVMatrix& vec, float scaleVec) { addVector(vec, scaleVec, *this); } void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) { addVector(vec, 1, target); } void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) { applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target); } void NVMatrix::equalsVector(NVMatrix& vec) { equalsVector(vec, *this); } void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) { eltwiseMultByVector(vec, target, getDefaultStream()); } void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream) { applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target, stream); } void NVMatrix::eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream) { eltwiseMultByVector(vec, *this, stream); } void NVMatrix::eltwiseMultByVector(NVMatrix& vec) { eltwiseMultByVector(vec, *this); } void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) { eltwiseDivideByVector(vec, *this); } void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) { applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target); }
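// [Editor's note] The vector variants above broadcast a row or column vector across
// the matrix via applyBinaryV. A minimal usage sketch (hypothetical helper, not part
// of the original source; assumes row-major data and a 1 x numCols vector):
//
//   void subtractColumnMeans(NVMatrix& data) {
//       NVMatrix means;
//       data.sum(0, means);                      // 1 x numCols column sums
//       means.scale(1.0f / data.getNumRows());   // sums -> means
//       data.addVector(means, -1);               // broadcast subtraction
//   }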
template<class Agg, class UnaryOp, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { _aggregate(axis, target, agg, uop, bop, stream, NULL); }
/* * TODO: this is a mess, fix it. it works pretty fast but it's too ugly. * TODO: this function is _really_ bad for very long aggregations of few columns. */
template<class Agg, class UnaryOp, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp) { assert(axis == 0 || axis == 1); assert(isContiguous() && target.isContiguous()); assert(&target != this); int width = _isTrans ? _numRows : _numCols; int height = _isTrans ? _numCols : _numRows; target.setTrans(_isTrans); assert(width > 0); assert(height > 0);
if((axis == 0 && !_isTrans) || (axis == 1 && _isTrans)) { //col sum target.resize(!_isTrans ? 1 : _numRows, !_isTrans ? _numCols : 1); // int height = getFollowingDim(); if ((height <= 2048 || width >= 4096)) { int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width); assert(numBlocks < NUM_BLOCKS_MAX); kDumbAggCols<Agg, UnaryOp, BinaryOp><<<numBlocks, NUM_SUM_COLS_THREADS_PER_BLOCK, 0, stream>>>(getTextureObject(), target.getDevData(), width, height, agg, uop, bop); getLastCudaError("kDumbAggCols: Kernel execution failed"); } else { // Specialize the case when we have very long columns and few of them const int sumLength = 128; bool deltmp = tmp == NULL; if (tmp == NULL) { tmp = new NVMatrix(false); } int numBlocksX = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); int numBlocksY = DIVUP(height, sumLength); tmp->resize(numBlocksY, width); dim3 blocks(numBlocksX, numBlocksY); dim3 threads(NUM_SUM_COLS_THREADS_PER_BLOCK); kAggCols<Agg, UnaryOp><<<blocks, threads, 0, stream>>>(getTextureObject(), tmp->getDevData(), width, height, sumLength, agg, uop); getLastCudaError("kAggCols: Kernel execution failed"); int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK); kDumbAggCols<Agg, NVMatrixOps::Identity, BinaryOp><<<numBlocks, NUM_SUM_COLS_THREADS_PER_BLOCK, 0, stream>>>(tmp->getTextureObject(), target.getDevData(), width, numBlocksY, agg, NVMatrixOps::Identity(), bop); getLastCudaError("kDumbAggCols: Kernel execution failed"); if (deltmp) { delete tmp; } } }
else { // row sum target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1); if (width > 1) { if (height >= 16384) { // linear aggregation int numBlocksX = 1; int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y); int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X; int numThreadsY = AGG_SHORT_ROWS_THREADS_Y; while (numBlocksY > NUM_BLOCKS_MAX) { numBlocksY = DIVUP(numBlocksY,2); numBlocksX *= 2; } dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); if(width <= 16) { if(width <= 4) { kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 4><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else if(width <= 8) { kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 8><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else if(width <= 12) { kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 12><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else { kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 16><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } } else if(width <= 32) { kAggShortRows<Agg, UnaryOp, BinaryOp, 2, 32><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else if(width <= 48){ kAggShortRows<Agg, UnaryOp, BinaryOp, 3, 32><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else if(width <= 64){ kAggShortRows<Agg, UnaryOp, BinaryOp, 4, 32><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } else { kAggShortRows2<Agg, UnaryOp, BinaryOp><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop); } }
else { if (width >= 512) { // NOTE: this is the only case which I bothered to try to optimize for Kepler dim3 threads(AWR_NUM_THREADS); dim3 blocks(1, height); kAggRows_wholerow_nosync<<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, agg, uop, bop); } else { int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 256 : 512))); int numThreadsY = 1; int numBlocksX = DIVUP(width, 2*numThreadsX); int numBlocksY = std::min(height, NUM_BLOCKS_MAX); dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY); assert(numBlocksX <= NUM_BLOCKS_MAX); assert(numBlocksY <= NUM_BLOCKS_MAX); if(width <= 64) { kAggRows<Agg, UnaryOp, BinaryOp, 32><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, target.getLeadingDim(), agg, uop, bop); } else if(width <= 128) { kAggRows<Agg, UnaryOp, BinaryOp, 64><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, target.getLeadingDim(), agg, uop, bop); } else if(width <= 256) { kAggRows<Agg, UnaryOp, BinaryOp, 128><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, target.getLeadingDim(), agg, uop, bop); } else if(width <= 512) { kAggRows<Agg, UnaryOp, BinaryOp, 256><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, target.getLeadingDim(), agg, uop, bop); } else { kAggRows<Agg, UnaryOp, BinaryOp, 512><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, target.getLeadingDim(), agg, uop, bop); } getLastCudaError("agg rows: Kernel execution failed"); } } } else { target.applyBinary(NVMatrixBinaryOps::CompositeSecond<UnaryOp, BinaryOp>(uop, bop), *this, target, stream); // copy(target, stream); } } }
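// [Editor's note] Dispatch summary for _aggregate above: axis 0 on row-major data
// (or axis 1 on column-major) reduces down columns; everything else reduces across
// rows, with the kernel chosen by row width. Usage sketch (editor-added,
// hypothetical helper):
//
//   void rowStats(NVMatrix& m, NVMatrix& rowMax, NVMatrix& rowSum) {
//       m.max(1, rowMax);   // per-row maxima -> numRows x 1
//       m.sum(1, rowSum);   // per-row sums   -> numRows x 1
//   }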
template<class Agg, class UnaryOp, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop) { _aggregate(axis, target, agg, uop, bop, getDefaultStream()); }
template<class Agg, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop) { _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream()); }
template<class Agg, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream) { _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream); }
template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop) { NVMatrix &sumVec = construct(); _aggregate(axis, sumVec, agg, uop, bop); return sumVec; }
template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) { NVMatrix &sumVec = construct(); _aggregate(axis, sumVec, agg, uop, bop, stream); return sumVec; }
template<class Agg, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop) { return _aggregate(axis, agg, NVMatrixOps::Identity(), bop); }
template<class Agg, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream) { return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream); }
template<class Agg, class UnaryOp, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { _aggregate(axis, target, agg, uop, bop, getDefaultStream(), &tmp); }
template<class Agg, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp) { _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream(), &tmp); }
template<class Agg, class BinaryOp> void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream, &tmp); }
template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) { NVMatrix &sumVec = construct(); _aggregate(axis, sumVec, agg, uop, bop, tmp); return sumVec; }
template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { NVMatrix &sumVec = construct(); _aggregate(axis, sumVec, agg, uop, bop, stream, &tmp); return sumVec; }
template<class Agg, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp) { return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, tmp); }
template<class Agg, class BinaryOp> NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) { return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream, tmp); }
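// [Editor's note] The overloads taking an NVMatrix& tmp let callers reuse one scratch
// buffer across many reductions instead of allocating a temporary per call. Sketch
// (editor-added; 'batches', 'numBatches' and 'colSums' are hypothetical):
//
//   NVMatrix scratch, colSums;
//   for (int i = 0; i < numBatches; ++i) {
//       batches[i].sum(0, colSums, scratch);  // reuses 'scratch' every iteration
//   }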
void NVMatrix::inRangeInc(float lower, float upper) { inRangeInc(lower, upper, *this); } void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) { apply(NVMatrixOps::InRange<false>(lower, upper), target); } void NVMatrix::inRangeExc(float lower, float upper) { inRangeExc(lower, upper, *this); } void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) { apply(NVMatrixOps::InRange<true>(lower, upper), target); }
void NVMatrix::biggerThanScalar(float scalar) { biggerThanScalar(scalar, *this); } void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) { apply(NVMatrixOps::BiggerThanScalar(scalar), target); } void NVMatrix::smallerThanScalar(float scalar) { smallerThanScalar(scalar, *this); } void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) { apply(NVMatrixOps::SmallerThanScalar(scalar), target); } void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) { apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target); } void NVMatrix::addScalar(float scalar, NVMatrix& target) { apply(NVMatrixOps::AddScalar(scalar), target); } void NVMatrix::addScalar(float scalar) { addScalar(scalar, *this); } void NVMatrix::minWithScalar(float scalar, NVMatrix& target) { apply(NVMatrixOps::MinWithScalar(scalar), target); } void NVMatrix::minWithScalar(float scalar) { minWithScalar(scalar, *this); } void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) { apply(NVMatrixOps::MaxWithScalar(scalar), target); } void NVMatrix::maxWithScalar(float scalar) { maxWithScalar(scalar, *this); } void NVMatrix::pow(float p, NVMatrix& target) { apply(NVMatrixOps::Pow(p), target); } void NVMatrix::pow(float p) { pow(p, *this); }
void NVMatrix::scale(float _scale) { scale(_scale, *this); } void NVMatrix::scale(float _scale, cudaStream_t stream) { scale(_scale, *this, stream); } void NVMatrix::scale(float _scale, NVMatrix& target) { scale(_scale, target, NVMatrix::getDefaultStream()); } void NVMatrix::scale(float _scale, NVMatrix& target, cudaStream_t stream) { if (_scale != 1 || &target != this) { // optimize away scale by 1 if (_scale == 1) { copy(target, stream); } else { apply(NVMatrixOps::MultByScalar(_scale), target, stream); } } } void NVMatrix::zero() { apply(NVMatrixOps::Zero()); } void NVMatrix::zero(NVMatrix& like) { resize(like); zero(); }
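// [Editor's note] scale() folds two fast paths into one branch: scaling in place by 1
// launches nothing, and scaling by 1 into a distinct target degrades to a plain copy.
// Observable behavior (editor-added sketch):
//
//   m.scale(1.0f);          // no-op: no kernel launched
//   m.scale(1.0f, other);   // plain copy into 'other'
//   m.scale(0.5f, other);   // MultByScalar kernel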
void NVMatrix::max(int axis, NVMatrix& target) { _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second()); } void NVMatrix::max(int axis, NVMatrix& target, NVMatrix& tmp) { _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second(), tmp); }
void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) { addSum(a, axis, scaleThis, scaleSum, getDefaultStream()); } void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream) { if (scaleThis != 0) { a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum), stream); } else { a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum), stream); } }
void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax) { addMax(a, axis, scaleThis, scaleMax, getDefaultStream()); } void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream) { if (scaleThis != 0) { a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleMax), stream); } else { a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::SecondScaled(scaleMax), stream); } }
void NVMatrix::sum(int axis, NVMatrix& target) { sum(axis, target, getDefaultStream()); } void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream) { _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream); } void NVMatrix::sum(int axis, NVMatrix& target, NVMatrix& tmp) { sum(axis, target, getDefaultStream(), tmp); } void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp) { _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream, tmp); } void NVMatrix::sumOfSquares(int axis, NVMatrix& target) { sumOfSquares(axis, target, getDefaultStream()); } void NVMatrix::sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream) { _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second(), stream); } void NVMatrix::min(int axis, NVMatrix& target) { _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second()); } NVMatrix& NVMatrix::max(int axis) { return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second()); } NVMatrix& NVMatrix::sum(int axis) { return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second()); } NVMatrix& NVMatrix::min(int axis) { return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second()); } NVMatrix& NVMatrix::sumOfSquares(int axis) { return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second()); }
void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads) { *threads = dim3(DP_BLOCKSIZE); *blocks = dim3(std::min(CPUSUM_MAX, DIVUP(n, DP_BLOCKSIZE))); } float NVMatrix::mean() { return sum() / getNumElements(); } float NVMatrix::sum() { return _totalAgg(NVMatrixAggs::Sum()); } float NVMatrix::sum(NVMatrix& tmpbuf) { return _totalAgg(NVMatrixAggs::Sum(), tmpbuf, getDefaultStream()); } float NVMatrix::max() { return _totalAgg(NVMatrixAggs::Max()); } float NVMatrix::min() { return _totalAgg(NVMatrixAggs::Min()); } float NVMatrix::countNan() { return _totalAgg(NVMatrixAggs::CountNan()); } float NVMatrix::countInf() { return _totalAgg(NVMatrixAggs::CountInf()); }
template<class Agg> float NVMatrix::_totalAgg(Agg agg) { return _totalAgg(agg, getDefaultStream()); } template<class Agg> float NVMatrix::_totalAgg(Agg agg, cudaStream_t stream) { NVMatrix tmp; return _totalAgg(agg, tmp, stream); } template<class Agg> float NVMatrix::_totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream) { assert(isContiguous()); dim3 blocks, threads; // Sum most of it on GPU _sum_setParams(getNumElements(), &blocks, &threads); tmpbuf.resize(1, blocks.x); kTotalAgg<<<blocks, threads, 0, stream>>>(getDevData(), tmpbuf.getDevData(), getNumElements(), agg); getLastCudaError("kTotalAgg: Kernel execution failed"); // Don't need to sync because we copyToHost in the same stream, so it's serialized // NVMatrix::syncStream(stream); return tmpbuf.cpuAgg(agg, stream); }
template<class Agg> float NVMatrix::cpuAgg(Agg agg, cudaStream_t stream) { Matrix bufCPU(getNumRows(), getNumCols()); copyToHost(bufCPU, false, stream); if (getNumElements() > 1) { // Sum remainder on CPU if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) { return bufCPU.sum(); } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) { return bufCPU.max(); } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) { return bufCPU.min(); } else if (typeid(Agg) == typeid(NVMatrixAggs::CountNan)) { return bufCPU.hasNan(); // not an exact count, just a NaN indicator } else if (typeid(Agg) == typeid(NVMatrixAggs::CountInf)) { return bufCPU.hasInf(); } else { assert(false); } } return bufCPU(0,0); }
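// [Editor's note] _totalAgg is a two-phase reduction: one kernel collapses the matrix
// into a small per-block buffer on the GPU, then cpuAgg() copies that buffer to the
// host (in the same stream, so no explicit sync is needed) and finishes the reduction
// there. Usage sketch (editor-added):
//
//   float total = m.sum();       // full reduction
//   float avg   = m.mean();      // sum() / element count
//   float nans  = m.countNan();  // nonzero if any NaNs present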
float NVMatrix::dotProduct(NVMatrix& b) { return dotProduct(b, getDefaultStream()); } float NVMatrix::dotProduct(NVMatrix& b, cudaStream_t stream) { NVMatrix tmp; return dotProduct(b, tmp, stream); }
/* * Fast dot product only for matrices with same transposedness. */ float NVMatrix::dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream) { assert(isContiguous() && b.isContiguous()); assert(isSameDims(b)); assert(isTrans() == b.isTrans()); dim3 blocks, threads; _sum_setParams(getNumElements(), &blocks, &threads); // NVMatrix target(1, blocks.x); tmp.resize(1, blocks.x); kDotProduct_r<<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), tmp.getDevData(), getNumElements()); getLastCudaError("kDotProduct_r: Kernel execution failed"); // cudaThreadSynchronize(); // syncStream(stream); // return tmp._totalAgg(NVMatrixAggs::Sum(), stream); return tmp.cpuAgg(NVMatrixAggs::Sum(), stream); }
float NVMatrix::norm2() { return dotProduct(*this); } float NVMatrix::norm() { return sqrt(norm2()); } void NVMatrix::print(int startRow, int rows, int startCol, int cols) const { // cudaThreadSynchronize(); syncDevice(); Matrix hm = Matrix(_numRows, _numCols); copyToHost(hm); hm.print(startRow, rows, startCol, cols); } void NVMatrix::print(int rows, int cols) const { print(0, rows, 0, cols); } void NVMatrix::printShape(const char* name) const { printf("%s: %dx%d\n", name, _numRows, _numCols); }
void NVMatrix::alloc(int numElements) { _memSegment = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(numElements * sizeof(float)); } void NVMatrix::dealloc() { DEVICE_MEMORY_MANAGER::getInstance(_memSegment->getDeviceID()).free(_memSegment); _memSegment = NULL; deallocTexture(); } void NVMatrix::deallocTexture() { if (_texObj != 0) { checkCudaErrors(cudaDestroyTextureObject(_texObj)); _texObj = 0; } }
cudaTextureObject_t NVMatrix::getTextureObject() { if (_texObj == 0) { assert(isContiguous()); //size_t memFree, memTotal; struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeLinear; resDesc.res.linear.devPtr = getDevData(); resDesc.res.linear.sizeInBytes = getNumDataBytes(); resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); checkCudaErrors(cudaCreateTextureObject(&_texObj, &resDesc, &texDesc, NULL)); } assert(_texObj != 0); return _texObj; }
NVMatrix& NVMatrix::construct() const { return *new NVMatrix(); } NVMatrix& NVMatrix::construct(bool isTrans) const { return *new NVMatrix(isTrans); } NVMatrix& NVMatrix::construct(int numRows, int numCols, bool isTrans) const { return *new NVMatrix(numRows, numCols, isTrans); } NVMatrix& NVMatrix::construct(const Matrix& like, bool copy) const { return *new NVMatrix(like, copy); } NVMatrix& NVMatrix::construct(const NVMatrix& like, bool copy) const { return *new NVMatrix(like, copy); } NVMatrix& NVMatrix::construct(const NVMatrix& like) const { return *new NVMatrix(like); } NVMatrix& NVMatrix::construct(const Matrix& like) const { return *new NVMatrix(like); } NVMatrix& NVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const { return *new NVMatrix(mem, numRows, numCols, stride, isTrans); }
std::pair<size_t, size_t> NVMatrix::getCudaMemorySize() { size_t memFree, memTotal; checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal)); return std::pair<size_t, size_t>(memFree, memTotal); } /* ================ * HostNVMatrix * ================ */ HostNVMatrix::~HostNVMatrix() { if
(_ownsData && _numElements > 0) { dealloc(); } else { // dealloc frees the mem segment. But if this is a view, // then we need to delete the mem segment object. // assert(_memSegment == NULL || _memSegment->getSize() == 0); delete _memSegment; } _deleted = true; } HostNVMatrix::HostNVMatrix() : NVMatrix() { _init(false); } HostNVMatrix::HostNVMatrix(bool isTrans) { _init(isTrans); } HostNVMatrix::HostNVMatrix(int numRows, int numCols, bool isTrans) { _init(isTrans); resize(numRows, numCols); } HostNVMatrix::HostNVMatrix(const Matrix& like, bool copy) { _init(like.isTrans()); resize(like.getNumRows(), like.getNumCols()); if (copy) { copyFromHost(like); } } HostNVMatrix::HostNVMatrix(const NVMatrix& like, bool copy) { _init(like.isTrans()); resize(like.getNumRows(), like.getNumCols()); if (copy) { like.copy(*this); } } HostNVMatrix::HostNVMatrix(const NVMatrix& like) { _init(like.isTrans()); resize(like.getNumRows(), like.getNumCols()); } HostNVMatrix::HostNVMatrix(const Matrix& like) { _init(false); resize(like.getNumRows(), like.getNumCols()); } HostNVMatrix::HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) : NVMatrix(mem, numRows, numCols, stride, isTrans) { } NVMatrix& HostNVMatrix::construct() const { return *new HostNVMatrix(); } NVMatrix& HostNVMatrix::construct(bool isTrans) const { return *new HostNVMatrix(isTrans); } NVMatrix& HostNVMatrix::construct(int numRows, int numCols, bool isTrans) const { return *new HostNVMatrix(numRows, numCols, isTrans); } NVMatrix& HostNVMatrix::construct(const Matrix& like, bool copy) const { return *new HostNVMatrix(like, copy); } NVMatrix& HostNVMatrix::construct(const NVMatrix& like, bool copy) const { return *new HostNVMatrix(like, copy); } NVMatrix& HostNVMatrix::construct(const NVMatrix& like) const { return *new HostNVMatrix(like); } NVMatrix& HostNVMatrix::construct(const Matrix& like) const { return *new HostNVMatrix(like); } NVMatrix& HostNVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const { return *new HostNVMatrix(mem, numRows, numCols, stride, isTrans); } void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) { if (resizeTarget) { resize(hostMatrix); } else { assert(isSameDims(hostMatrix)); } setTrans(hostMatrix.isTrans()); if (getNumElements() > 0) { checkCudaErrors(cudaMemcpy2D(getDevData(), _stride * sizeof(float), hostMatrix.getData(), hostMatrix.getLeadingDim() * sizeof(float), getLeadingDim() * sizeof(float), getFollowingDim(), cudaMemcpyHostToHost)); // syncStream(stream); } } void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) { copyFromHost(hostMatrix, resizeTarget, 0); } void HostNVMatrix::copyFromHost(const Matrix& hostMatrix) { copyFromHost(hostMatrix, false, 0); } void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const { if (resizeTarget) { hostMatrix.resize(getNumRows(), getNumCols()); } else { assert(isSameDims(hostMatrix)); } hostMatrix.setTrans(_isTrans); if (getNumElements() > 0) { checkCudaErrors(cudaMemcpy2D(hostMatrix.getData(), hostMatrix.getLeadingDim() * sizeof(float), getDevData(), _stride * sizeof(float), getLeadingDim() * sizeof(float), getFollowingDim(), cudaMemcpyHostToHost)); // syncStream(stream); } } void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const { copyToHost(hostMatrix, resizeTarget, 0); } void HostNVMatrix::copyToHost(Matrix& hostMatrix) const { copyToHost(hostMatrix, false, 0); } void 
HostNVMatrix::alloc(int numElements) { // checkCudaErrors(cudaHostAlloc(&_devData, numElements * sizeof(float), cudaHostAllocPortable)); _memSegment = HOST_MEMORY_MANAGER::getInstance().malloc(numElements * sizeof(float)); // _memSegment = FastHostMemoryManager::getInstance().malloc(numElements * sizeof(float)); } void HostNVMatrix::dealloc() { // FastHostMemoryManager::getInstance().free(_memSegment); HOST_MEMORY_MANAGER::getInstance().free(_memSegment); _memSegment = NULL; // checkCudaErrors(cudaFreeHost(_devData)); } cudaTextureObject_t HostNVMatrix::getTextureObject() { assert(false); return 0; }
================================================ FILE: caffe2/contrib/cuda-convnet2/nvmatrix/src/nvmatrix_kernels.cu ================================================
/* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
#include <stdio.h> #include <curand_kernel.h> #include "../include/nvmatrix_kernels.cuh"
__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; // const unsigned int numEls = tgtWidth * tgtHeight; for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) { const uint y = i / tgtWidth; const uint x = i % tgtWidth; const uint srcY = y % srcHeight; const uint srcX = x % srcWidth; tgt[i] = src[srcY * srcWidth + srcX]; } }
__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements) { __shared__ float shmem[DP_BLOCKSIZE]; uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x; shmem[threadIdx.x] = 0; if (eidx < gridDim.x * DP_BLOCKSIZE) { for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) { shmem[threadIdx.x] += a[eidx] * b[eidx]; } } __syncthreads(); if (threadIdx.x < 256) { shmem[threadIdx.x] += shmem[threadIdx.x + 256]; } __syncthreads(); if (threadIdx.x < 128) { shmem[threadIdx.x] += shmem[threadIdx.x + 128]; } __syncthreads(); if (threadIdx.x < 64) { shmem[threadIdx.x] += shmem[threadIdx.x + 64]; } __syncthreads(); if (threadIdx.x < 32) { volatile float* mysh = &shmem[threadIdx.x]; *mysh += mysh[32]; *mysh += mysh[16]; *mysh += mysh[8]; *mysh += mysh[4]; *mysh += mysh[2]; *mysh += mysh[1]; if (threadIdx.x == 0) { target[blockIdx.x] = *mysh; } } }
__global__ void kSetupCurand(curandState *state, unsigned long long seed) { const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; /* Each thread gets same seed, a different sequence number, no offset */ curand_init(seed, tidx, 0, &state[tidx]); }
================================================ FILE: caffe2/contrib/cuda-convnet2/python_util/__init__.py ================================================
# Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: caffe2/contrib/cuda-convnet2/python_util/data.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as n from numpy.random import randn, rand, random_integers import os from threading import Thread from util import * BATCH_META_FILE = "batches.meta" class DataLoaderThread(Thread): def __init__(self, path, tgt): Thread.__init__(self) self.path = path self.tgt = tgt def run(self): self.tgt += [unpickle(self.path)] class DataProvider: BATCH_REGEX = re.compile('^data_batch_(\d+)(\.\d+)?$') def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): if batch_range == None: batch_range = DataProvider.get_batch_nums(data_dir) if init_batchnum is None or init_batchnum not in batch_range: init_batchnum = batch_range[0] self.data_dir = data_dir self.batch_range = batch_range self.curr_epoch = init_epoch self.curr_batchnum = init_batchnum self.dp_params = dp_params self.batch_meta = self.get_batch_meta(data_dir) self.data_dic = None self.test = test self.batch_idx = batch_range.index(init_batchnum) def get_next_batch(self): if self.data_dic is None or len(self.batch_range) > 1: self.data_dic = self.get_batch(self.curr_batchnum) epoch, batchnum = self.curr_epoch, self.curr_batchnum self.advance_batch() return epoch, batchnum, self.data_dic def get_batch(self, batch_num): fname = self.get_data_file_name(batch_num) if os.path.isdir(fname): # batch in sub-batches sub_batches = sorted(os.listdir(fname), key=alphanum_key) #print sub_batches num_sub_batches = len(sub_batches) tgts = [[] for i in xrange(num_sub_batches)] threads = [DataLoaderThread(os.path.join(fname, s), tgt) for (s, tgt) in zip(sub_batches, tgts)] for thread in threads: thread.start() for thread in threads: thread.join() return [t[0] for t in tgts] return unpickle(self.get_data_file_name(batch_num)) def get_data_dims(self,idx=0): return self.batch_meta['num_vis'] if idx == 0 else 1 def advance_batch(self): self.batch_idx = self.get_next_batch_idx() self.curr_batchnum = self.batch_range[self.batch_idx] if self.batch_idx == 0: # we wrapped self.curr_epoch += 1 def get_next_batch_idx(self): return (self.batch_idx + 1) % len(self.batch_range) def get_next_batch_num(self): return self.batch_range[self.get_next_batch_idx()] # get filename of current batch def get_data_file_name(self, batchnum=None): if batchnum is None: batchnum = self.curr_batchnum return os.path.join(self.data_dir, 'data_batch_%d' % batchnum) @classmethod def get_instance(cls, 
data_dir, batch_range=None, init_epoch=1, init_batchnum=None, type="default", dp_params={}, test=False): # NOTE: DataProvider cannot be referenced inside its own class definition, so the default entry is not registered here #cls.dp_classes['default'] = DataProvider type = type or DataProvider.get_batch_meta(data_dir)['dp_type'] # allow data to decide data provider if type.startswith("dummy-"): name = "-".join(type.split('-')[:-1]) + "-n" if name not in dp_types: raise DataProviderException("No such data provider: %s" % type) _class = dp_classes[name] dims = int(type.split('-')[-1]) return _class(dims) elif type in dp_types: _class = dp_classes[type] return _class(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) raise DataProviderException("No such data provider: %s" % type)
@classmethod def register_data_provider(cls, name, desc, _class): if name in dp_types: raise DataProviderException("Data provider %s already registered" % name) dp_types[name] = desc dp_classes[name] = _class @staticmethod def get_batch_meta(data_dir): return unpickle(os.path.join(data_dir, BATCH_META_FILE)) @staticmethod def get_batch_filenames(srcdir): return sorted([f for f in os.listdir(srcdir) if DataProvider.BATCH_REGEX.match(f)], key=alphanum_key) @staticmethod def get_batch_nums(srcdir): names = DataProvider.get_batch_filenames(srcdir) return sorted(list(set(int(DataProvider.BATCH_REGEX.match(n).group(1)) for n in names))) @staticmethod def get_num_batches(srcdir): return len(DataProvider.get_batch_nums(srcdir))
class DummyDataProvider(DataProvider): def __init__(self, data_dim): #self.data_dim = data_dim self.batch_range = [1] self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True} self.curr_epoch = 1 self.curr_batchnum = 1 self.batch_idx = 0 def get_next_batch(self): epoch, batchnum = self.curr_epoch, self.curr_batchnum self.advance_batch() data = rand(512, self.get_data_dims()).astype(n.single) return self.curr_epoch, self.curr_batchnum, {'data':data}
class LabeledDataProvider(DataProvider): def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False): DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test) def get_num_classes(self): return len(self.batch_meta['label_names'])
class LabeledDummyDataProvider(DummyDataProvider): def __init__(self, data_dim, num_classes=10, num_cases=7): #self.data_dim = data_dim self.batch_range = [1] self.batch_meta = {'num_vis': data_dim, 'label_names': [str(x) for x in range(num_classes)], 'data_in_rows':True} self.num_cases = num_cases self.num_classes = num_classes self.curr_epoch = 1 self.curr_batchnum = 1 self.batch_idx=0 self.data = None def get_num_classes(self): return self.num_classes def get_next_batch(self): epoch, batchnum = self.curr_epoch, self.curr_batchnum self.advance_batch() if self.data is None: data = rand(self.num_cases, self.get_data_dims()).astype(n.single) # <--changed to rand labels = n.require(n.c_[random_integers(0,self.num_classes-1,self.num_cases)], requirements='C', dtype=n.single) self.data, self.labels = data, labels else: data, labels = self.data, self.labels # print data.shape, labels.shape return self.curr_epoch, self.curr_batchnum, [data.T, labels.T ]
dp_types = {"dummy-n": "Dummy data provider for n-dimensional data", "dummy-labeled-n": "Labeled dummy data provider for n-dimensional data"} dp_classes = {"dummy-n": DummyDataProvider, "dummy-labeled-n": LabeledDummyDataProvider} class DataProviderException(Exception): pass
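# [Editor's note] A minimal sketch of the registration hook above (hypothetical
# provider name and class, not part of the original source):
#
#   class MyDataProvider(LabeledDataProvider):
#       pass
#
#   DataProvider.register_data_provider('my-dp', 'My data provider', MyDataProvider)
#   dp = DataProvider.get_instance(data_dir, batch_range=None, type='my-dp')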
================================================ FILE: caffe2/contrib/cuda-convnet2/python_util/gpumodel.py ================================================
# Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import numpy as n import os from time import time, asctime, localtime, strftime from util import * from data import * from options import * from math import ceil, floor, sqrt from data import DataProvider, dp_types import sys import shutil import platform from os import linesep as NL from threading import Thread import tempfile as tf
class ModelStateException(Exception): pass
class CheckpointWriter(Thread): def __init__(self, path, dic): Thread.__init__(self) self.path = path self.dic = dic def run(self): save_dir = os.path.dirname(self.path) save_file = os.path.basename(self.path) # Write checkpoint to temporary filename tmpfile = tf.NamedTemporaryFile(dir=os.path.dirname(save_dir), delete=False) pickle(tmpfile, self.dic) # Also closes tmpfile # Move it to final filename os.rename(tmpfile.name, self.path) # Delete old checkpoints for f in os.listdir(save_dir): if f != save_file: os.remove(os.path.join(save_dir, f))
# GPU Model interface class IGPUModel: def __init__(self, model_name, op, load_dic, filename_options=[], dp_params={}): # these are input parameters self.model_name = model_name self.op = op self.options = op.options self.load_dic = load_dic self.filename_options = filename_options self.dp_params = dp_params self.device_ids = self.op.get_value('gpu') self.fill_excused_options() self.checkpoint_writer = None #assert self.op.all_values_given() for o in op.get_options_list(): setattr(self, o.name, o.value) self.loaded_from_checkpoint = load_dic is not None # these are things that the model must remember but they're not input parameters if self.loaded_from_checkpoint: self.model_state = load_dic["model_state"] self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else self.options['load_file'].value if not os.path.isdir(self.save_file) and os.path.exists(self.save_file): self.save_file = os.path.dirname(self.save_file) # print self.options["save_file_override"].value, self.save_file else: self.model_state = {} self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else os.path.join(self.options['save_path'].value, model_name + "_" + '_'.join(['%s_%s' % (char, self.options[opt].get_str_value()) for opt, char in filename_options]) + '_' + strftime('%Y-%m-%d_%H.%M.%S')) self.model_state["train_outputs"] = [] self.model_state["test_outputs"] = [] self.model_state["epoch"] = 1 self.model_state["batchnum"] = self.train_batch_range[0] # print self.save_file self.init_data_providers() if load_dic: self.train_data_provider.advance_batch() # model state often requires knowledge of data provider, so it's initialized after try: self.init_model_state() except ModelStateException, e: print e sys.exit(1) for var, val in self.model_state.iteritems(): setattr(self, var, val) self.import_model() self.init_model_lib()
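# [Editor's note] CheckpointWriter above uses the standard atomic-save pattern:
# write to a NamedTemporaryFile on the same filesystem, then os.rename() over the
# final path, so a crash mid-write never leaves a truncated checkpoint behind.
# Standalone sketch (editor-added, hypothetical helper; cPickle matches this
# file's Python 2 vintage):
#
#   import os, tempfile, cPickle
#   def atomic_pickle(path, obj):
#       f = tempfile.NamedTemporaryFile(dir=os.path.dirname(path), delete=False)
#       cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
#       f.close()
#       os.rename(f.name, path)  # atomic on POSIX within one filesystem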
def import_model(self): print "=========================" print "Importing %s C++ module" % ('_' + self.model_name) self.libmodel = __import__('_' + self.model_name) def fill_excused_options(self): pass def init_data_providers(self): self.dp_params['convnet'] = self try: self.test_data_provider = DataProvider.get_instance(self.data_path, self.test_batch_range, type=self.dp_type, dp_params=self.dp_params, test=True) self.train_data_provider = DataProvider.get_instance(self.data_path, self.train_batch_range, self.model_state["epoch"], self.model_state["batchnum"], type=self.dp_type, dp_params=self.dp_params, test=False) except DataProviderException, e: print "Unable to create data provider: %s" % e self.print_data_providers() sys.exit() def init_model_state(self): pass def init_model_lib(self): pass def start(self): if self.test_only: self.test_outputs += [self.get_test_error()] self.print_test_results() else: self.train() self.cleanup() if self.force_save: self.save_state().join() sys.exit(0) def train(self): print "=========================" print "Training %s" % self.model_name self.op.print_values() print "=========================" self.print_model_state() print "Running on CUDA device(s) %s" % ", ".join("%d" % d for d in self.device_ids) print "Current time: %s" % asctime(localtime()) print "Saving checkpoints to %s" % self.save_file print "=========================" next_data = self.get_next_batch() while self.epoch <= self.num_epochs: data = next_data self.epoch, self.batchnum = data[0], data[1] self.print_iteration() sys.stdout.flush() compute_time_py = time() self.start_batch(data) # load the next batch while the current one is computing next_data = self.get_next_batch() batch_output = self.finish_batch() self.train_outputs += [batch_output] self.print_train_results() if self.get_num_batches_done() % self.testing_freq == 0: self.sync_with_host() self.test_outputs += [self.get_test_error()] self.print_test_results() self.print_test_status() self.conditional_save() self.print_elapsed_time(time() - compute_time_py) def cleanup(self): if self.checkpoint_writer is not None: self.checkpoint_writer.join() self.checkpoint_writer = None def print_model_state(self): pass def get_num_batches_done(self): return len(self.train_batch_range) * (self.epoch - 1) + self.batchnum - self.train_batch_range[0] + 1 def get_next_batch(self, train=True): dp = self.train_data_provider if not train: dp = self.test_data_provider return self.parse_batch_data(dp.get_next_batch(), train=train) def parse_batch_data(self, batch_data, train=True): return batch_data[0], batch_data[1], batch_data[2]['data'] def start_batch(self, batch_data, train=True): self.libmodel.startBatch(batch_data[2], not train) def finish_batch(self): return self.libmodel.finishBatch() def print_iteration(self): print "\t%d.%d..." 
% (self.epoch, self.batchnum), def print_elapsed_time(self, compute_time_py): print "(%.3f sec)" % (compute_time_py) def print_train_results(self): batch_error = self.train_outputs[-1][0] if not (batch_error > 0 and batch_error < 2e20): print "Crazy train error: %.6f" % batch_error self.cleanup() print "Train error: %.6f " % (batch_error), def print_test_results(self): batch_error = self.test_outputs[-1][0] print "%s\t\tTest error: %.6f" % (NL, batch_error), def print_test_status(self): status = (len(self.test_outputs) == 1 or self.test_outputs[-1][0] < self.test_outputs[-2][0]) and "ok" or "WORSE" print status, def sync_with_host(self): if self.checkpoint_writer is not None: self.checkpoint_writer.join() self.checkpoint_writer = None self.libmodel.syncWithHost() def conditional_save(self): batch_error = self.test_outputs[-1][0] if batch_error > 0 and batch_error < self.max_test_err: self.save_state() else: print "\tTest error > %g, not saving." % self.max_test_err, def aggregate_test_outputs(self, test_outputs): test_error = tuple([sum(t[r] for t in test_outputs) / (1 if self.test_one else len(self.test_batch_range)) for r in range(len(test_outputs[-1]))]) return test_error def get_test_error(self): next_data = self.get_next_batch(train=False) test_outputs = [] while True: data = next_data start_time_test = time() self.start_batch(data, train=False) load_next = (not self.test_one or self.test_only) and data[1] < self.test_batch_range[-1] if load_next: # load next batch next_data = self.get_next_batch(train=False) test_outputs += [self.finish_batch()] if self.test_only: # Print the individual batch results for safety print "batch %d: %s" % (data[1], str(test_outputs[-1])), self.print_elapsed_time(time() - start_time_test) if not load_next: break sys.stdout.flush() return self.aggregate_test_outputs(test_outputs) def set_var(self, var_name, var_val): setattr(self, var_name, var_val) self.model_state[var_name] = var_val return var_val def get_var(self, var_name): return self.model_state[var_name] def has_var(self, var_name): return var_name in self.model_state def save_state(self): for att in self.model_state: if hasattr(self, att): self.model_state[att] = getattr(self, att) dic = {"model_state": self.model_state, "op": self.op} checkpoint_file = "%d.%d" % (self.epoch, self.batchnum) checkpoint_file_full_path = os.path.join(self.save_file, checkpoint_file) if not os.path.exists(self.save_file): os.makedirs(self.save_file) assert self.checkpoint_writer is None self.checkpoint_writer = CheckpointWriter(checkpoint_file_full_path, dic) self.checkpoint_writer.start() print "-------------------------------------------------------" print "Saved checkpoint to %s" % self.save_file print "=======================================================", return self.checkpoint_writer def get_progress(self): num_batches_total = self.num_epochs * len(self.train_batch_range) return min(1.0, max(0.0, float(self.get_num_batches_done()-1) / num_batches_total)) @staticmethod def load_checkpoint(load_dir): if os.path.isdir(load_dir): return unpickle(os.path.join(load_dir, sorted(os.listdir(load_dir), key=alphanum_key)[-1])) return unpickle(load_dir) @staticmethod def get_options_parser(): op = OptionsParser() op.add_option("load-file", "load_file", StringOptionParser, "Load file", default="", excuses=OptionsParser.EXCUSE_ALL) op.add_option("save-path", "save_path", StringOptionParser, "Save path", excuses=['save_file_override']) op.add_option("save-file", "save_file_override", StringOptionParser, "Save file override", 
excuses=['save_path']) op.add_option("train-range", "train_batch_range", RangeOptionParser, "Data batch range: training") op.add_option("test-range", "test_batch_range", RangeOptionParser, "Data batch range: testing") op.add_option("data-provider", "dp_type", StringOptionParser, "Data provider", default="default") op.add_option("test-freq", "testing_freq", IntegerOptionParser, "Testing frequency", default=25) op.add_option("epochs", "num_epochs", IntegerOptionParser, "Number of epochs", default=500) op.add_option("data-path", "data_path", StringOptionParser, "Data path") op.add_option("max-test-err", "max_test_err", FloatOptionParser, "Maximum test error for saving") op.add_option("test-only", "test_only", BooleanOptionParser, "Test and quit?", default=0) op.add_option("test-one", "test_one", BooleanOptionParser, "Test on one batch at a time?", default=1) op.add_option("force-save", "force_save", BooleanOptionParser, "Force save before quitting", default=0) op.add_option("gpu", "gpu", ListOptionParser(IntegerOptionParser), "GPU override") return op @staticmethod def print_data_providers(): print "Available data providers:" for dp, desc in dp_types.iteritems(): print " %s: %s" % (dp, desc) @staticmethod def parse_options(op): try: load_dic = None options = op.parse() load_location = None # print options['load_file'].value_given, options['save_file_override'].value_given # print options['save_file_override'].value if options['load_file'].value_given: load_location = options['load_file'].value elif options['save_file_override'].value_given and os.path.exists(options['save_file_override'].value): load_location = options['save_file_override'].value if load_location is not None: load_dic = IGPUModel.load_checkpoint(load_location) old_op = load_dic["op"] old_op.merge_from(op) op = old_op op.eval_expr_defaults() return op, load_dic except OptionMissingException, e: print e op.print_usage() except OptionException, e: print e except UnpickleError, e: print "Error loading checkpoint:" print e sys.exit() ================================================ FILE: caffe2/contrib/cuda-convnet2/python_util/options.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys from getopt import getopt import os import re #import types TERM_BOLD_START = "\033[1m" TERM_BOLD_END = "\033[0m" class Option: def __init__(self, letter, name, desc, parser, set_once, default, excuses, requires, save): assert not name is None self.letter = letter self.name = name self.desc = desc self.parser = parser self.set_once = set_once self.default = default self.excuses = excuses self.requires = requires self.save = save self.value = None self.value_given = False self.prefixed_letter = min(2, len(letter)) * '-' + letter def set_value(self, value, parse=True): try: self.value = self.parser.parse(value) if parse else value self.value_given = True # print self.name, self.value except OptionException, e: raise OptionException("Unable to parse option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) def set_default(self): if not self.default is None: self.value = self.default def eval_expr_default(self, env): try: if isinstance(self.default, OptionExpression) and not self.value_given: self.value = self.default.evaluate(env) if not self.parser.is_type(self.value): raise OptionException("expression result %s is not of right type (%s)" % (self.value, self.parser.get_type_str())) except Exception, e: raise OptionException("Unable to set default value for option %s (%s): %s" % (self.prefixed_letter, self.desc, e)) def get_str_value(self, get_default_str=False): val = self.value if get_default_str: val = self.default if val is None: return "" if isinstance(val, OptionExpression): return val.expr return self.parser.to_string(val) class OptionsParser: """An option parsing class. All options without default values are mandatory, unless a excuses option (usually a load file) is given. Does not support options without arguments.""" SORT_LETTER = 1 SORT_DESC = 2 SORT_EXPR_LAST = 3 EXCUSE_ALL = "all" def __init__(self): self.options = {} def add_option(self, letter, name, parser, desc, set_once=False, default=None, excuses=[], requires=[], save=True): """ The letter parameter is the actual parameter that the user will have to supply on the command line. The name parameter is some name to be given to this option and must be a valid python variable name. An explanation of the "default" parameter: The default value, if specified, should have the same type as the option. You can also specify an expression as the default value. In this case, the default value of the parameter will be the output of the expression. The expression may assume all other option names as local variables. For example, you can define the hidden bias learning rate to be 10 times the weight learning rate by setting this default: default=OptionExpression("eps_w * 10") (assuming an option named eps_w exists). However, it is up to you to make sure you do not make any circular expression definitions. Note that the order in which the options are parsed is arbitrary. In particular, expression default values that depend on other expression default values will often raise errors (depending on the order in which they happen to be parsed). Therefore it is best not to make the default value of one variable depend on the value of another if the other variable's default value is itself an expression. An explanation of the "excuses" parameter: All options are mandatory, but certain options can exclude other options from being mandatory. For example, if the excuses parameter for option "load_file" is ["num_hid", "num_vis"], then the options num_hid and num_vis are not mandatory as long as load_file is specified. 
Use the special flag EXCUSE_ALL to allow an option to make all other options optional. """ assert name not in self.options self.options[name] = Option(letter, name, desc, parser, set_once, default, excuses, requires, save) def set_value(self, name, value, parse=True): self.options[name].set_value(value, parse=parse) def get_value(self, name): return self.options[name].value def delete_option(self, name): if name in self.options: del self.options[name] def parse(self, eval_expr_defaults=False): """Parses the options in sys.argv based on the options added to this parser. The default behavior is to leave any expression default options as OptionExpression objects. Set eval_expr_defaults=True to circumvent this.""" short_opt_str = ''.join(["%s:" % self.options[name].letter for name in self.options if len(self.options[name].letter) == 1]) long_opts = ["%s=" % self.options[name].letter for name in self.options if len(self.options[name].letter) > 1] (go, ga) = getopt(sys.argv[1:], short_opt_str, longopts=long_opts) dic = dict(go) for o in self.get_options_list(sort_order=self.SORT_EXPR_LAST): if o.prefixed_letter in dic: o.set_value(dic[o.prefixed_letter]) else: # check if excused or has default excused = max([o2.prefixed_letter in dic for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses]) if not excused and o.default is None: raise OptionMissingException("Option %s (%s) not supplied" % (o.prefixed_letter, o.desc)) o.set_default() # check requirements if o.prefixed_letter in dic: for o2 in self.get_options_list(sort_order=self.SORT_LETTER): if o2.name in o.requires and o2.prefixed_letter not in dic: raise OptionMissingException("Option %s (%s) requires option %s (%s)" % (o.prefixed_letter, o.desc, o2.prefixed_letter, o2.desc)) if eval_expr_defaults: self.eval_expr_defaults() return self.options def merge_from(self, op2): """Merges the options in op2 into this instance, but does not overwrite this instances's SET options with op2's default values.""" for name, o in self.options.iteritems(): if name in op2.options and ((op2.options[name].value_given and op2.options[name].value != self.options[name].value) or not op2.options[name].save): if op2.options[name].set_once: raise OptionException("Option %s (%s) cannot be changed" % (op2.options[name].prefixed_letter, op2.options[name].desc)) self.options[name] = op2.options[name] for name in op2.options: if name not in self.options: self.options[name] = op2.options[name] def eval_expr_defaults(self): env = dict([(name, o.value) for name, o in self.options.iteritems()]) for o in self.options.values(): o.eval_expr_default(env) def all_values_given(self): return max([o.value_given for o in self.options.values() if o.default is not None]) def get_options_list(self, sort_order=SORT_LETTER): """ Returns the list of Option objects in this OptionParser, sorted as specified""" cmp = lambda x, y: (x.desc < y.desc and -1 or 1) if sort_order == self.SORT_LETTER: cmp = lambda x, y: (x.letter < y.letter and -1 or 1) elif sort_order == self.SORT_EXPR_LAST: cmp = lambda x, y: (type(x.default) == OptionExpression and 1 or -1) return sorted(self.options.values(), cmp=cmp) def print_usage(self, print_constraints=False): print "%s usage:" % os.path.basename(sys.argv[0]) opslist = self.get_options_list() usage_strings = [] num_def = 0 for o in opslist: excs = ' ' if o.default is None: excs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses])) reqs = ', 
'.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.name in o.requires])) usg = (OptionsParser._bold(o.prefixed_letter) + " <%s>" % o.parser.get_type_str(), o.desc, ("[%s]" % o.get_str_value(get_default_str=True)) if not o.default is None else None, excs, reqs) if o.default is None: usage_strings += [usg] else: usage_strings.insert(num_def, usg) num_def += 1 col_widths = [self._longest_value(usage_strings, key=lambda x:x[i]) for i in range(len(usage_strings[0]) - 1)] col_names = [" Option", "Description", "Default"] if print_constraints: col_names += ["Excused by", "Requires"] for i, s in enumerate(col_names): print self._bold(s.ljust(col_widths[i])), print "" for l, d, de, ex, req in usage_strings: if de is None: de = ' ' print (" %s -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), else: print (" [%s] -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]), if print_constraints: print ex.ljust(col_widths[3]), req else: print "" def print_values(self): longest_desc = self._longest_value(self.options.values(), key=lambda x:x.desc) longest_def_value = self._longest_value([v for v in self.options.values() if not v.value_given and not v.default is None], key=lambda x:x.get_str_value()) for o in self.get_options_list(sort_order=self.SORT_DESC): print "%s: %s %s" % (o.desc.ljust(longest_desc), o.get_str_value().ljust(longest_def_value), (not o.value_given and not o.default is None) and "[DEFAULT]" or "") @staticmethod def _longest_value(values, key=lambda x:x): mylen = lambda x: 0 if x is None else len(x) return mylen(key(max(values, key=lambda x:mylen(key(x))))) @staticmethod def _bold(str): return TERM_BOLD_START + str + TERM_BOLD_END class OptionException(Exception): pass class OptionMissingException(OptionException): pass class OptionParser: @staticmethod def parse(value): return str(value) @staticmethod def to_string(value): return str(value) @staticmethod def get_type_str(): pass class IntegerOptionParser(OptionParser): @staticmethod def parse(value): try: return int(value) except: raise OptionException("argument is not an integer") @staticmethod def get_type_str(): return "int" @staticmethod def is_type(value): return type(value) == int class BooleanOptionParser(OptionParser): @staticmethod def parse(value): try: v = int(value) if not v in (0,1): raise OptionException return v except: raise OptionException("argument is not a boolean") @staticmethod def get_type_str(): return "0/1" @staticmethod def is_type(value): return type(value) == int and value in (0, 1) class StringOptionParser(OptionParser): @staticmethod def get_type_str(): return "string" @staticmethod def is_type(value): return type(value) == str class FloatOptionParser(OptionParser): @staticmethod def parse(value): try: return float(value) except: raise OptionException("argument is not a float") @staticmethod def to_string(value): return "%.6g" % value @staticmethod def get_type_str(): return "float" @staticmethod def is_type(value): return type(value) == float class RangeOptionParser(OptionParser): @staticmethod def parse(value): m = re.match("^(\d+)\-(\d+)$", value) try: if m: return range(int(m.group(1)), int(m.group(2)) + 1) return [int(value)] except: raise OptionException("argument is neither an integer nor a range") @staticmethod def to_string(value): return "%d-%d" % (value[0], value[-1]) @staticmethod def get_type_str(): return "int[-int]" @staticmethod def is_type(value): return type(value) == list class ListOptionParser(OptionParser): """ A 
parser that parses a delimited list of items. If the "parsers" argument is a list of parsers, then the list of items must have the form and length specified by that list. Example: ListOptionParser([FloatOptionParser, IntegerOptionParser]) would parse "0.5,3" but not "0.5,3,0.6" or "0.5" or "3,0.5". If the "parsers" argument is another parser, then the list of items may be of arbitrary length, but each item must be parseable by the given parser. Example: ListOptionParser(FloatOptionParser) would parse "0.5" and "0.5,0.3" and "0.5,0.3,0.6", etc. """ def __init__(self, parsers, sepchar=','): self.parsers = parsers self.sepchar = sepchar def parse(self, value): values = value.split(self.sepchar) if type(self.parsers) == list and len(values) != len(self.parsers): raise OptionException("requires %d arguments, given %d" % (len(self.parsers), len(values))) try: if type(self.parsers) == list: return [p.parse(v) for p, v in zip(self.parsers, values)] return [self.parsers.parse(v) for v in values] except: raise OptionException("argument is not of the form %s" % self.get_type_str()) def to_string(self, value): if type(self.parsers) == list: return self.sepchar.join([p.to_string(v) for p, v in zip(self.parsers, value)]) return self.sepchar.join([self.parsers.to_string(v) for v in value]) def get_type_str(self): if type(self.parsers) == list: return self.sepchar.join([p.get_type_str() for p in self.parsers]) return "%s%s..." % (self.parsers.get_type_str(), self.sepchar) @staticmethod def is_type(value): return type(value) == list class OptionExpression: """ This allows you to specify option values in terms of other option values. Example: op.add_option("eps-w", "eps_w", ListOptionParser(FloatOptionParser), "Weight learning rates for each layer") op.add_option("eps-b", "eps_b", ListOptionParser(FloatOptionParser), "Bias learning rates for each layer", default=OptionExpression("[o * 10 for o in eps_w]")) This says: the default bias learning rate for each layer is 10 times the weight learning rate for that layer. """ def __init__(self, expr): self.expr = expr def evaluate(self, options): locals().update(options) try: return eval(self.expr) except Exception, e: raise OptionException("expression '%s': unable to parse: %s" % (self.expr, e)) ================================================ FILE: caffe2/contrib/cuda-convnet2/python_util/util.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re import cPickle import os from cStringIO import StringIO class UnpickleError(Exception): pass GPU_LOCK_NO_SCRIPT = -2 GPU_LOCK_NO_LOCK = -1 def pickle(filename, data): fo = filename if type(filename) == str: fo = open(filename, "w") cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL) fo.close() def unpickle(filename): if not os.path.exists(filename): raise UnpickleError("Path '%s' does not exist." 
% filename) fo = open(filename, 'r') z = StringIO() file_size = os.fstat(fo.fileno()).st_size # Read 1GB at a time to avoid overflow while fo.tell() < file_size: z.write(fo.read(1 << 30)) fo.close() dict = cPickle.loads(z.getvalue()) z.close() return dict def is_intel_machine(): VENDOR_ID_REGEX = re.compile('^vendor_id\s+: (\S+)') f = open('/proc/cpuinfo') for line in f: m = VENDOR_ID_REGEX.match(line) if m: f.close() return m.group(1) == 'GenuineIntel' f.close() return False # Returns the CPUs associated with a given GPU def get_cpus_for_gpu(gpu): #proc = subprocess.Popen(['nvidia-smi', '-q', '-i', str(gpu)], stdout=subprocess.PIPE) #lines = proc.communicate()[0] #lines = subprocess.check_output(['nvidia-smi', '-q', '-i', str(gpu)]).split(os.linesep) with open('/proc/driver/nvidia/gpus/%d/information' % gpu) as f: for line in f: if line.startswith('Bus Location'): bus_id = line.split(':', 1)[1].strip() bus_id = bus_id[:7] + ':' + bus_id[8:] ff = open('/sys/module/nvidia/drivers/pci:nvidia/%s/local_cpulist' % bus_id) cpus_str = ff.readline() ff.close() cpus = [cpu for s in cpus_str.split(',') for cpu in range(int(s.split('-')[0]),int(s.split('-')[1])+1)] return cpus return [-1] def get_cpu(): if is_intel_machine(): return 'intel' return 'amd' def is_windows_machine(): return os.name == 'nt' def tryint(s): try: return int(s) except: return s def alphanum_key(s): return [tryint(c) for c in re.split('([0-9]+)', s)] ================================================ FILE: caffe2/contrib/cuda-convnet2/shownet.py ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys from tarfile import TarFile, TarInfo from matplotlib import pylab as pl import numpy as n import getopt as opt from python_util.util import * from math import sqrt, ceil, floor from python_util.gpumodel import IGPUModel import random as r import numpy.random as nr from convnet import ConvNet from python_util.options import * from PIL import Image from time import sleep class ShowNetError(Exception): pass class ShowConvNet(ConvNet): def __init__(self, op, load_dic): ConvNet.__init__(self, op, load_dic) def init_data_providers(self): self.need_gpu = self.op.get_value('show_preds') class Dummy: def advance_batch(self): pass if self.need_gpu: ConvNet.init_data_providers(self) else: self.train_data_provider = self.test_data_provider = Dummy() def import_model(self): if self.need_gpu: ConvNet.import_model(self) def init_model_state(self): if self.op.get_value('show_preds'): self.softmax_name = self.op.get_value('show_preds') def init_model_lib(self): if self.need_gpu: ConvNet.init_model_lib(self) def plot_cost(self): if self.show_cost not in self.train_outputs[0][0]: raise ShowNetError("Cost function with name '%s' not defined by given convnet." 
% self.show_cost) # print self.test_outputs train_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.train_outputs] test_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.test_outputs] if self.smooth_test_errors: test_errors = [sum(test_errors[max(0,i-len(self.test_batch_range)):i])/(i-max(0,i-len(self.test_batch_range))) for i in xrange(1,len(test_errors)+1)] numbatches = len(self.train_batch_range) test_errors = n.row_stack(test_errors) test_errors = n.tile(test_errors, (1, self.testing_freq)) test_errors = list(test_errors.flatten()) test_errors += [test_errors[-1]] * max(0,len(train_errors) - len(test_errors)) test_errors = test_errors[:len(train_errors)] numepochs = len(train_errors) / float(numbatches) pl.figure(1) x = range(0, len(train_errors)) pl.plot(x, train_errors, 'k-', label='Training set') pl.plot(x, test_errors, 'r-', label='Test set') pl.legend() ticklocs = range(numbatches, len(train_errors) - len(train_errors) % numbatches + 1, numbatches) epoch_label_gran = int(ceil(numepochs / 20.)) epoch_label_gran = int(ceil(float(epoch_label_gran) / 10) * 10) if numepochs >= 10 else epoch_label_gran ticklabels = map(lambda x: str((x[1] / numbatches)) if x[0] % epoch_label_gran == epoch_label_gran-1 else '', enumerate(ticklocs)) pl.xticks(ticklocs, ticklabels) pl.xlabel('Epoch') # pl.ylabel(self.show_cost) pl.title('%s[%d]' % (self.show_cost, self.cost_idx)) # print "plotted cost" def make_filter_fig(self, filters, filter_start, fignum, _title, num_filters, combine_chans, FILTERS_PER_ROW=16): MAX_ROWS = 24 MAX_FILTERS = FILTERS_PER_ROW * MAX_ROWS num_colors = filters.shape[0] f_per_row = int(ceil(FILTERS_PER_ROW / float(1 if combine_chans else num_colors))) filter_end = min(filter_start+MAX_FILTERS, num_filters) filter_rows = int(ceil(float(filter_end - filter_start) / f_per_row)) filter_pixels = filters.shape[1] filter_size = int(sqrt(filters.shape[1])) fig = pl.figure(fignum) fig.text(.5, .95, '%s %dx%d filters %d-%d' % (_title, filter_size, filter_size, filter_start, filter_end-1), horizontalalignment='center') num_filters = filter_end - filter_start if not combine_chans: bigpic = n.zeros((filter_size * filter_rows + filter_rows + 1, filter_size*num_colors * f_per_row + f_per_row + 1), dtype=n.single) else: bigpic = n.zeros((3, filter_size * filter_rows + filter_rows + 1, filter_size * f_per_row + f_per_row + 1), dtype=n.single) for m in xrange(filter_start,filter_end ): filter = filters[:,:,m] y, x = (m - filter_start) / f_per_row, (m - filter_start) % f_per_row if not combine_chans: for c in xrange(num_colors): filter_pic = filter[c,:].reshape((filter_size,filter_size)) bigpic[1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, 1 + (1 + filter_size*num_colors) * x + filter_size*c:1 + (1 + filter_size*num_colors) * x + filter_size*(c+1)] = filter_pic else: filter_pic = filter.reshape((3, filter_size,filter_size)) bigpic[:, 1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size, 1 + (1 + filter_size) * x:1 + (1 + filter_size) * x + filter_size] = filter_pic pl.xticks([]) pl.yticks([]) if not combine_chans: pl.imshow(bigpic, cmap=pl.cm.gray, interpolation='nearest') else: bigpic = bigpic.swapaxes(0,2).swapaxes(0,1) pl.imshow(bigpic, interpolation='nearest') def plot_filters(self): FILTERS_PER_ROW = 16 filter_start = 0 # First filter to show if self.show_filters not in self.layers: raise ShowNetError("Layer with name '%s' not 
defined by given convnet." % self.show_filters) layer = self.layers[self.show_filters] filters = layer['weights'][self.input_idx] # filters = filters - filters.min() # filters = filters / filters.max() if layer['type'] == 'fc': # Fully-connected layer num_filters = layer['outputs'] channels = self.channels filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) elif layer['type'] in ('conv', 'local'): # Conv layer num_filters = layer['filters'] channels = layer['filterChannels'][self.input_idx] if layer['type'] == 'local': filters = filters.reshape((layer['modules'], channels, layer['filterPixels'][self.input_idx], num_filters)) filters = filters[:, :, :, self.local_plane] # first map for now (modules, channels, pixels) filters = filters.swapaxes(0,2).swapaxes(0,1) num_filters = layer['modules'] # filters = filters.swapaxes(0,1).reshape(channels * layer['filterPixels'][self.input_idx], num_filters * layer['modules']) # num_filters *= layer['modules'] FILTERS_PER_ROW = layer['modulesX'] else: filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1]) # Convert YUV filters to RGB if self.yuv_to_rgb and channels == 3: R = filters[0,:,:] + 1.28033 * filters[2,:,:] G = filters[0,:,:] + -0.21482 * filters[1,:,:] + -0.38059 * filters[2,:,:] B = filters[0,:,:] + 2.12798 * filters[1,:,:] filters[0,:,:], filters[1,:,:], filters[2,:,:] = R, G, B combine_chans = not self.no_rgb and channels == 3 # Make sure you don't modify the backing array itself here -- so no -= or /= if self.norm_filters: #print filters.shape filters = filters - n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).mean(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1)) filters = filters / n.sqrt(n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).var(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1))) #filters = filters - n.tile(filters.min(axis=0).min(axis=0), (3, filters.shape[1], 1)) #filters = filters / n.tile(filters.max(axis=0).max(axis=0), (3, filters.shape[1], 1)) #else: filters = filters - filters.min() filters = filters / filters.max() self.make_filter_fig(filters, filter_start, 2, 'Layer %s' % self.show_filters, num_filters, combine_chans, FILTERS_PER_ROW=FILTERS_PER_ROW) def plot_predictions(self): epoch, batch, data = self.get_next_batch(train=False) # get a test batch num_classes = self.test_data_provider.get_num_classes() NUM_ROWS = 2 NUM_COLS = 4 NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1] NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs'] PRED_IDX = 1 label_names = [lab.split(',')[0] for lab in self.test_data_provider.batch_meta['label_names']] if self.only_errors: preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single) else: preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single) #rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS] rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS) if NUM_IMGS < data[0].shape[1]: data = [n.require(d[:,rand_idx], requirements='C') for d in data] # data += [preds] # Run the model print [d.shape for d in data], preds.shape self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name]) IGPUModel.finish_batch(self) print preds data[0] = 
self.test_data_provider.get_plottable_data(data[0]) if self.save_preds: if not gfile.Exists(self.save_preds): gfile.MakeDirs(self.save_preds) preds_thresh = preds > 0.5 # Binarize predictions data[0] = data[0] * 255.0 data[0][data[0]<0] = 0 data[0][data[0]>255] = 255 data[0] = n.require(data[0], dtype=n.uint8) dir_name = '%s_predictions_batch_%d' % (os.path.basename(self.save_file), batch) tar_name = os.path.join(self.save_preds, '%s.tar' % dir_name) tfo = gfile.GFile(tar_name, "w") tf = TarFile(fileobj=tfo, mode='w') for img_idx in xrange(NUM_IMGS): img = data[0][img_idx,:,:,:] imsave = Image.fromarray(img) prefix = "CORRECT" if data[1][0,img_idx] == preds_thresh[img_idx,PRED_IDX] else "FALSE_POS" if preds_thresh[img_idx,PRED_IDX] == 1 else "FALSE_NEG" file_name = "%s_%.2f_%d_%05d_%d.png" % (prefix, preds[img_idx,PRED_IDX], batch, img_idx, data[1][0,img_idx]) # gf = gfile.GFile(file_name, "w") file_string = StringIO() imsave.save(file_string, "PNG") tarinf = TarInfo(os.path.join(dir_name, file_name)) tarinf.size = file_string.tell() file_string.seek(0) tf.addfile(tarinf, file_string) tf.close() tfo.close() # gf.close() print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name) else: fig = pl.figure(3, figsize=(12,9)) fig.text(.4, .95, '%s test samples' % ('Mistaken' if self.only_errors else 'Random')) if self.only_errors: # what the net got wrong if NUM_OUTPUTS > 1: err_idx = [i for i,p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:,i] > 0)[0]] else: err_idx = n.where(data[1][0,:] != preds[:,0].T)[0] print err_idx err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS)) data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:] import matplotlib.gridspec as gridspec import matplotlib.colors as colors cconv = colors.ColorConverter() gs = gridspec.GridSpec(NUM_ROWS*2, NUM_COLS, width_ratios=[1]*NUM_COLS, height_ratios=[2,1]*NUM_ROWS ) #print data[1] for row in xrange(NUM_ROWS): for col in xrange(NUM_COLS): img_idx = row * NUM_COLS + col if data[0].shape[0] <= img_idx: break pl.subplot(gs[(row * 2) * NUM_COLS + col]) #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1) pl.xticks([]) pl.yticks([]) img = data[0][img_idx,:,:,:] pl.imshow(img, interpolation='lanczos') show_title = data[1].shape[0] == 1 true_label = [int(data[1][0,img_idx])] if show_title else n.where(data[1][:,img_idx]==1)[0] #print true_label #print preds[img_idx,:].shape #print preds[img_idx,:].max() true_label_names = [label_names[i] for i in true_label] img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:] #print img_labels axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col]) height = 0.5 ylocs = n.array(range(NUM_TOP_CLASSES))*height pl.barh(ylocs, [l[0] for l in img_labels], height=height, \ color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels]) #pl.title(", ".join(true_labels)) if show_title: pl.title(", ".join(true_label_names), fontsize=15, fontweight='bold') else: print true_label_names pl.yticks(ylocs + height/2, [l[1] for l in img_labels], x=1, backgroundcolor=cconv.to_rgba('0.65', alpha=0.5), weight='bold') for line in enumerate(axes.get_yticklines()): line[1].set_visible(False) #pl.xticks([width], ['']) #pl.yticks([]) pl.xticks([]) pl.ylim(0, ylocs[-1] + height) pl.xlim(0, 1) def start(self): self.op.print_values() # print self.show_cost if self.show_cost: self.plot_cost() if self.show_filters: self.plot_filters() if self.show_preds: self.plot_predictions() if pl: pl.show() sys.exit(0) 
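# ---------------------------------------------------------------------------
# A minimal, hypothetical driver sketch (not part of this repository) showing
# how the OptionsParser machinery from python_util/options.py above is meant
# to be driven: register options with add_option(), parse sys.argv, and let an
# OptionExpression default be evaluated against other option values. The
# option names and the example command line are invented for illustration.
from python_util.options import OptionsParser, IntegerOptionParser, \
    StringOptionParser, FloatOptionParser, ListOptionParser, OptionExpression

op = OptionsParser()
# An EXCUSE_ALL option (mirroring gpumodel's load-file) makes the other
# options optional whenever it is supplied on the command line.
op.add_option("load-file", "load_file", StringOptionParser,
              "Checkpoint to load", default="", excuses=OptionsParser.EXCUSE_ALL)
op.add_option("epochs", "num_epochs", IntegerOptionParser, "Number of epochs", default=1)
op.add_option("eps-w", "eps_w", ListOptionParser(FloatOptionParser), "Weight learning rates")
# Default bias rates are derived from eps_w, as the OptionExpression docstring describes:
op.add_option("eps-b", "eps_b", ListOptionParser(FloatOptionParser), "Bias learning rates",
              default=OptionExpression("[o * 10 for o in eps_w]"))
op.parse(eval_expr_defaults=True)   # e.g. invoked as: script.py --epochs 5 --eps-w 0.01,0.02
print op.get_value("eps_b")         # -> [0.1, 0.2]
# ---------------------------------------------------------------------------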
@classmethod def get_options_parser(cls): op = ConvNet.get_options_parser() for option in list(op.options): if option not in ('gpu', 'load_file', 'inner_size', 'train_batch_range', 'test_batch_range', 'multiview_test', 'data_path', 'pca_noise', 'scalar_mean'): op.delete_option(option) op.add_option("show-cost", "show_cost", StringOptionParser, "Show specified objective function", default="") op.add_option("show-filters", "show_filters", StringOptionParser, "Show learned filters in specified layer", default="") op.add_option("norm-filters", "norm_filters", BooleanOptionParser, "Individually normalize filters shown with --show-filters", default=0) op.add_option("input-idx", "input_idx", IntegerOptionParser, "Input index for layer given to --show-filters", default=0) op.add_option("cost-idx", "cost_idx", IntegerOptionParser, "Cost function return value index for --show-cost", default=0) op.add_option("no-rgb", "no_rgb", BooleanOptionParser, "Don't combine filter channels into RGB in layer given to --show-filters", default=False) op.add_option("yuv-to-rgb", "yuv_to_rgb", BooleanOptionParser, "Convert RGB filters to YUV in layer given to --show-filters", default=False) op.add_option("channels", "channels", IntegerOptionParser, "Number of channels in layer given to --show-filters (fully-connected layers only)", default=0) op.add_option("show-preds", "show_preds", StringOptionParser, "Show predictions made by given softmax on test set", default="") op.add_option("save-preds", "save_preds", StringOptionParser, "Save predictions to given path instead of showing them", default="") op.add_option("only-errors", "only_errors", BooleanOptionParser, "Show only mistaken predictions (to be used with --show-preds)", default=False, requires=['show_preds']) op.add_option("local-plane", "local_plane", IntegerOptionParser, "Local plane to show", default=0) op.add_option("smooth-test-errors", "smooth_test_errors", BooleanOptionParser, "Use running average for test error plot?", default=1) op.options['load_file'].default = None return op if __name__ == "__main__": #nr.seed(6) try: op = ShowConvNet.get_options_parser() op, load_dic = IGPUModel.parse_options(op) model = ShowConvNet(op, load_dic) model.start() except (UnpickleError, ShowNetError, opt.GetoptError), e: print "----------------" print "Error:" print e ================================================ FILE: caffe2/contrib/cuda-convnet2/util/Makefile ================================================ # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. LINK_LIBS := -L$(ATLAS_LIB_PATH) -latlas -lcblas INCLUDES := -I./include COMMONFLAGS := CC_ARGS := CC=g++ ifndef debug CC_ARGS += -O3 endif OUT_DIR=./bin/$(OUT_SUFFIX) OUT_FILE=libutil.so ifeq ($(numpy), 1) PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' 
-f 1,2)
LINK_LIBS += -lpython$(PYTHON_VERSION)
INCLUDES += -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)
COMMONFLAGS += -DNUMPY_INTERFACE
OUT_FILE=libutilpy.so
endif

OBJECTS = matrix.cpp

all: dir classes $(OUT_FILE)

dir:
	mkdir -p $(OUT_DIR)/src

SOURCES = $(shell echo src/*.cpp)
CLASSES = $(SOURCES:.cpp=.o)

classes: $(CLASSES)

%.o: %.cpp
	$(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o

$(OUT_FILE): classes
	cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS)
	ln -sf $(OUT_DIR)/$(OUT_FILE) .

clean:
	rm -rf $(OUT_DIR)/*

================================================
FILE: caffe2/contrib/cuda-convnet2/util/include/matrix.h
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MATRIX_H_
#define MATRIX_H_

#include "matrix_funcs.h"
#ifdef NUMPY_INTERFACE
// NOTE: the two header names here were stripped during extraction; <Python.h>
// and <arrayobject.h> are the headers this NumPy interface requires.
#include <Python.h>
#include <arrayobject.h>
#endif
// NOTE: six system includes were stripped during extraction; the set below is
// a reconstruction covering everything this header uses (numeric_limits,
// assert, printf, std::vector, math functions).
#include <limits>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <vector>

extern "C" {
// #include <cblas.h>
#include "caffe2/utils/cblas.h"
}

#ifdef DOUBLE_PRECISION
#define CBLAS_GEMM cblas_dgemm
#define CBLAS_SCAL cblas_dscal
#define CBLAS_AXPY cblas_daxpy
#else
#define CBLAS_GEMM cblas_sgemm
#define CBLAS_SCAL cblas_sscal
#define CBLAS_AXPY cblas_saxpy
#endif /* DOUBLE_PRECISION */

// NOTE: the template argument was stripped during extraction; <MTYPE> restored.
#define MTYPE_MAX numeric_limits<MTYPE>::max()

typedef long long int int64;

class Matrix {
private:
    MTYPE* _data;
    bool _ownsData;
    int64 _numRows, _numCols;
    int64 _numElements;
    CBLAS_TRANSPOSE _trans;

    void _init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData);
    void _tileTo2(Matrix& target) const;
    void _copyAllTo(Matrix& target) const;
    MTYPE _sum_column(int64 col) const;
    MTYPE _sum_row(int64 row) const;
    MTYPE _aggregate(MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
    void _aggregate(int64 axis, Matrix& target, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
    MTYPE _aggregateRow(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
    MTYPE _aggregateCol(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
    void _updateDims(int64 numRows, int64 numCols);
    void _applyLoop(MTYPE(*func)(MTYPE));
    void _applyLoop(MTYPE (*func)(MTYPE), Matrix& target);
    void _applyLoop2(const Matrix& a, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const;
    void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const;
    void _applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
    void _checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
    void _divideByVector(const Matrix& vec, Matrix& target);
    inline int64 _getNumColsBackEnd() const { return _trans == CblasNoTrans ?
_numCols : _numRows; } public: enum FUNCTION { TANH, RECIPROCAL, SQUARE, ABS, EXP, LOG, ZERO, ONE, LOGISTIC1, LOGISTIC2, SIGN }; Matrix(); Matrix(int64 numRows, int64 numCols); Matrix(int64 numRows, int64 numCols, bool transpose); #ifdef NUMPY_INTERFACE Matrix(const PyArrayObject *src); #endif Matrix(const Matrix &like); Matrix(MTYPE* data, int64 numRows, int64 numCols); Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose); ~Matrix(); inline MTYPE& getCell(int64 i, int64 j) const { assert(i >= 0 && i < _numRows); assert(j >= 0 && j < _numCols); if (_trans == CblasTrans) { return _data[j * _numRows + i]; } return _data[i * _numCols + j]; } MTYPE& operator()(int64 i, int64 j) const { return getCell(i, j); } inline MTYPE* getData() const { return _data; } inline bool isView() const { return !_ownsData; } inline int64 getNumRows() const { return _numRows; } inline int64 getNumCols() const { return _numCols; } inline int64 getNumDataBytes() const { return _numElements * sizeof(MTYPE); } inline int64 getNumElements() const { return _numElements; } inline int64 getLeadingDim() const { return _trans == CblasTrans ? _numRows : _numCols; } inline int64 getFollowingDim() const { return _trans == CblasTrans ? _numCols : _numRows; } inline CBLAS_TRANSPOSE getBLASTrans() const { return _trans; } inline bool isSameDims(const Matrix& a) const { return a.getNumRows() == getNumRows() && a.getNumCols() == getNumCols(); } inline bool isTrans() const { return _trans == CblasTrans; } /* * Only use if you know what you're doing! * Does not update any dimensions. Just flips the _trans flag. * * Use transpose() if you want to get the transpose of this matrix. */ inline void setTrans(bool trans) { assert(isTrans() == trans || !isView()); _trans = trans ? CblasTrans : CblasNoTrans; } void apply(FUNCTION f); void apply(Matrix::FUNCTION f, Matrix& target); void subtractFromScalar(MTYPE scalar); void subtractFromScalar(MTYPE scalar, Matrix &target) const; void biggerThanScalar(MTYPE scalar); void smallerThanScalar(MTYPE scalar); void equalsScalar(MTYPE scalar); void biggerThanScalar(MTYPE scalar, Matrix& target) const; void smallerThanScalar(MTYPE scalar, Matrix& target) const; void equalsScalar(MTYPE scalar, Matrix& target) const; void biggerThan(Matrix& a); void biggerThan(Matrix& a, Matrix& target) const; void smallerThan(Matrix& a); void smallerThan(Matrix& a, Matrix& target) const; void minWith(Matrix &a); void minWith(Matrix &a, Matrix &target) const; void maxWith(Matrix &a); void maxWith(Matrix &a, Matrix &target) const; void equals(Matrix& a); void equals(Matrix& a, Matrix& target) const; void notEquals(Matrix& a) ; void notEquals(Matrix& a, Matrix& target) const; void add(const Matrix &m); void add(const Matrix &m, MTYPE scale); void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM); void add(const Matrix &m, Matrix& target); void add(const Matrix &m, MTYPE scaleM, Matrix &target); void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target); void subtract(const Matrix &m); void subtract(const Matrix &m, Matrix& target); void subtract(const Matrix &m, MTYPE scale); void subtract(const Matrix &m, MTYPE scale, Matrix& target); void addVector(const Matrix& vec, MTYPE scale); void addVector(const Matrix& vec, MTYPE scale, Matrix& target); void addVector(const Matrix& vec); void addVector(const Matrix& vec, Matrix& target); void addScalar(MTYPE scalar); void addScalar(MTYPE scalar, Matrix& target) const; void maxWithScalar(MTYPE scalar); void maxWithScalar(MTYPE scalar, Matrix &target) 
const;
    void minWithScalar(MTYPE scalar);
    void minWithScalar(MTYPE scalar, Matrix &target) const;
    void eltWiseMultByVector(const Matrix& vec);
    void eltWiseMultByVector(const Matrix& vec, Matrix& target);
    void eltWiseDivideByVector(const Matrix& vec);
    void eltWiseDivideByVector(const Matrix& vec, Matrix& target);
    void resize(int64 newNumRows, int64 newNumCols);
    void resize(const Matrix& like);
    Matrix& slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
    void slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix &target) const;
    Matrix& sliceRows(int64 startRow, int64 endRow) const;
    void sliceRows(int64 startRow, int64 endRow, Matrix& target) const;
    Matrix& sliceCols(int64 startCol, int64 endCol) const;
    void sliceCols(int64 startCol, int64 endCol, Matrix& target) const;
    void rightMult(const Matrix &b, MTYPE scale);
    void rightMult(const Matrix &b, Matrix &target) const;
    void rightMult(const Matrix &b);
    void rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const;
    void addProduct(const Matrix &a, const Matrix &b, MTYPE scaleAB, MTYPE scaleThis);
    void addProduct(const Matrix& a, const Matrix& b);
    void eltWiseMult(const Matrix& a);
    void eltWiseMult(const Matrix& a, Matrix& target) const;
    void eltWiseDivide(const Matrix& a);
    void eltWiseDivide(const Matrix& a, Matrix &target) const;
    Matrix& transpose() const;
    Matrix& transpose(bool hard) const;
    Matrix& tile(int64 timesY, int64 timesX) const;
    void tile(int64 timesY, int64 timesX, Matrix& target) const;
    void copy(Matrix &dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const;
    Matrix& copy() const;
    void copy(Matrix& target) const;
    Matrix& sum(int64 axis) const;
    void sum(int64 axis, Matrix &target) const;
    MTYPE sum() const;
    MTYPE max() const;
    Matrix& max(int64 axis) const;
    void max(int64 axis, Matrix& target) const;
    MTYPE min() const;
    Matrix& min(int64 axis) const;
    void min(int64 axis, Matrix& target) const;
    MTYPE norm() const;
    MTYPE norm2() const;
    void scale(MTYPE scale);
    void scale(MTYPE alpha, Matrix& target);
    void reshape(int64 numRows, int64 numCols);
    Matrix& reshaped(int64 numRows, int64 numCols);
    void printShape(const char* name) const;
    bool hasNan() const;
    bool hasInf() const;
    void randomizeNormal(MTYPE mean, MTYPE stdev);
    void randomizeUniform();
    void randomizeNormal();
    void print() const;
    void print(int64 startRow, int64 rows, int64 startCol, int64 cols) const;
    void print(int64 rows, int64 cols) const;
};

// NOTE: the vector's element type was stripped during extraction; <Matrix*>
// is reconstructed here, mirroring the NVMatrixV typedef in the GPU code.
typedef std::vector<Matrix*> MatrixV;

#endif /* MATRIX_H_ */
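What follows is a small, hypothetical usage sketch of the Matrix class declared above; it is not part of the repository, and building it assumes libutil plus a CBLAS implementation are linked. It fills a matrix through operator(), multiplies with rightMult (which resizes the target and dispatches to CBLAS_GEMM), and prints the result.

#include "matrix.h"

int main() {
    Matrix a(2, 3), b(3, 2), c;
    for (int64 i = 0; i < a.getNumRows(); i++)
        for (int64 j = 0; j < a.getNumCols(); j++)
            a(i, j) = (MTYPE)(i + j);   // operator() returns MTYPE&
    b.apply(Matrix::ONE);               // set every element of b to 1
    a.rightMult(b, c);                  // c := a * b; resizes c, calls CBLAS_GEMM
    c.addScalar((MTYPE)0.5);            // elementwise add
    c.print();                          // 2x2 result: row sums of a, plus 0.5
    return 0;
}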
================================================
FILE: caffe2/contrib/cuda-convnet2/util/include/matrix_funcs.h
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MATRIX_FUNCS_H_
#define MATRIX_FUNCS_H_

// NOTE: three includes were stripped during extraction; this reconstruction
// covers rand()/RAND_MAX, the math functions, and std::min/std::max.
#include <stdlib.h>
#include <math.h>
#include <algorithm>

#ifdef DOUBLE_PRECISION
#define MTYPE double
#else
#define MTYPE float
#endif

#define MYRAND ((double)rand() / ((double)RAND_MAX + 1))

inline MTYPE _zero(MTYPE x) { return 0; }
inline MTYPE _one(MTYPE x) { return 1; }
inline MTYPE _abs(MTYPE x) { return x > 0 ? x : -x; }
inline MTYPE _square(MTYPE x) { return x * x; }
inline MTYPE _sigma1(MTYPE x) { return (tanh(x / 2) + 1) / 2; }
inline MTYPE _sigma2(MTYPE x) { return 1 / (1 + exp(-x)); }
inline MTYPE _recip(MTYPE x) { return 1 / x; }
inline MTYPE _exp(MTYPE x) { return exp(x); }
inline MTYPE _log(MTYPE x) { return log(x); }
inline MTYPE _tanh(MTYPE x) { return tanh(x); }
inline MTYPE _sign(MTYPE x) { return x > 0 ? 1 : -1; }
inline MTYPE _rand(MTYPE x) { return MYRAND; }

inline MTYPE _divide(MTYPE x, MTYPE y) { return x / y; }
inline MTYPE _mult(MTYPE x, MTYPE y) { return x * y; }
inline MTYPE _add(MTYPE x, MTYPE y) { return x + y; }
inline MTYPE _addSquare(MTYPE x, MTYPE y) { return x*x + y; }
inline MTYPE _addWithScale(MTYPE x, MTYPE y, MTYPE scale) { return x + scale*y; }
inline MTYPE _addWithScale2(MTYPE x, MTYPE y, MTYPE scaleThis, MTYPE scaleM) { return scaleThis * x + scaleM * y; }
inline MTYPE _max(MTYPE x, MTYPE y) { return std::max(x, y); }
inline MTYPE _min(MTYPE x, MTYPE y) { return std::min(x, y); }
inline MTYPE _bigger(MTYPE x, MTYPE y) { return x > y; }
inline MTYPE _smaller(MTYPE x, MTYPE y) { return x < y; }
inline MTYPE _equal(MTYPE x, MTYPE y) { return x == y; }
inline MTYPE _notEqual(MTYPE x, MTYPE y) { return x != y; }

#endif /* MATRIX_FUNCS_H_ */
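As a quick illustration (hypothetical, not from the repository) of the scalar helpers above: _sigma1 (tanh form) and _sigma2 (exp form) are two algebraically identical ways to compute the logistic function, since (tanh(x/2) + 1) / 2 = 1 / (1 + e^-x).

#include <stdio.h>
#include "matrix_funcs.h"

int main() {
    for (int i = -2; i <= 2; i++) {
        MTYPE x = (MTYPE)i;
        // Both columns should agree to floating-point precision.
        printf("x=%d  tanh-form=%f  exp-form=%f\n", i, (double)_sigma1(x), (double)_sigma2(x));
    }
    return 0;
}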
================================================
FILE: caffe2/contrib/cuda-convnet2/util/include/queue.h
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef QUEUE_H_
#define QUEUE_H_

// NOTE: two includes were stripped during extraction; <pthread.h> and
// <stdlib.h> are reconstructed, and <string.h> is added for memcpy().
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

/*
 * A thread-safe circular queue that automatically grows but never shrinks.
 */
// NOTE: the template parameter list was stripped during extraction;
// <class T> is restored from the uses of T below.
template <class T>
class Queue {
private:
    T *_elements;
    int _numElements;
    int _head, _tail;
    int _maxSize;
    pthread_mutex_t *_queueMutex;
    pthread_cond_t *_queueCV;

    void _init(int initialSize) {
        _numElements = 0;
        _head = 0;
        _tail = 0;
        _maxSize = initialSize;
        _elements = new T[initialSize];
        _queueCV = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
        _queueMutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
        pthread_mutex_init(_queueMutex, NULL);
        pthread_cond_init(_queueCV, NULL);
    }

    void expand() {
        T *newStorage = new T[_maxSize * 2];
        memcpy(newStorage, _elements + _head, (_maxSize - _head) * sizeof(T));
        memcpy(newStorage + _maxSize - _head, _elements, _tail * sizeof(T));
        delete[] _elements;
        _elements = newStorage;
        _head = 0;
        _tail = _numElements;
        _maxSize *= 2;
    }
public:
    Queue(int initialSize) {
        _init(initialSize);
    }

    Queue() {
        _init(1);
    }

    ~Queue() {
        pthread_mutex_destroy(_queueMutex);
        pthread_cond_destroy(_queueCV);
        delete[] _elements;
        free(_queueMutex);
        free(_queueCV);
    }

    void enqueue(T el) {
        pthread_mutex_lock(_queueMutex);
        if (_numElements == _maxSize) {
            expand();
        }
        _elements[_tail] = el;
        _tail = (_tail + 1) % _maxSize;
        _numElements++;
        pthread_cond_signal(_queueCV);
        pthread_mutex_unlock(_queueMutex);
    }

    /*
     * Blocks until not empty.
     */
    T dequeue() {
        pthread_mutex_lock(_queueMutex);
        // Apparently, pthread_cond_signal may actually unblock
        // multiple threads, so a while loop is needed here.
        while (_numElements == 0) {
            pthread_cond_wait(_queueCV, _queueMutex);
        }
        T el = _elements[_head];
        _head = (_head + 1) % _maxSize;
        _numElements--;
        pthread_mutex_unlock(_queueMutex);
        return el;
    }

    /*
     * Obviously this number can change by the time you actually look at it.
     */
    inline int getNumElements() const {
        return _numElements;
    }
};

#endif /* QUEUE_H_ */
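A minimal single-threaded sketch (hypothetical, not part of the repository) of the queue above: the backing buffer starts at capacity 1 and expand() doubles it transparently as elements arrive. Link with -lpthread.

#include <stdio.h>
#include "queue.h"

int main() {
    Queue<int> q;                      // default ctor: initial capacity 1
    for (int i = 0; i < 5; i++)
        q.enqueue(i);                  // capacity grows 1 -> 2 -> 4 -> 8 along the way
    while (q.getNumElements() > 0)
        printf("%d\n", q.dequeue());   // FIFO order: 0 1 2 3 4
    return 0;
}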
================================================
FILE: caffe2/contrib/cuda-convnet2/util/include/sync.h
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef SYNC_H_
#define SYNC_H_

// NOTE: one include was stripped during extraction; <pthread.h> is
// reconstructed, and <stdlib.h> is added for malloc()/free().
#include <pthread.h>
#include <stdlib.h>

class Lock {
private:
    pthread_mutex_t _mutex;
public:
    Lock() {
        pthread_mutex_init(&_mutex, NULL);
    }
    ~Lock() {
        pthread_mutex_destroy(&_mutex);
    }
    void acquire() {
        pthread_mutex_lock(&_mutex);
    }
    void release() {
        pthread_mutex_unlock(&_mutex);
    }
};

class ThreadSynchronizer {
private:
    int _numThreads;
    int _numSynced;
    pthread_mutex_t *_syncMutex;
    pthread_cond_t *_syncThresholdCV;
public:
    ThreadSynchronizer(int numThreads) {
        _numThreads = numThreads;
        _numSynced = 0;
        _syncMutex = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t));
        _syncThresholdCV = (pthread_cond_t*) malloc(sizeof(pthread_cond_t));
        pthread_mutex_init(_syncMutex, NULL);
        pthread_cond_init(_syncThresholdCV, NULL);
    }
    ~ThreadSynchronizer() {
        pthread_mutex_destroy(_syncMutex);
        pthread_cond_destroy(_syncThresholdCV);
        free(_syncMutex);
        free(_syncThresholdCV);
    }
    void sync() {
        pthread_mutex_lock(_syncMutex);
        _numSynced++;
        if (_numSynced == _numThreads) {
            _numSynced = 0;
            pthread_cond_broadcast(_syncThresholdCV);
        } else {
            pthread_cond_wait(_syncThresholdCV, _syncMutex);
        }
        pthread_mutex_unlock(_syncMutex);
    }
};

#endif /* SYNC_H_ */

================================================
FILE: caffe2/contrib/cuda-convnet2/util/include/thread.h
================================================
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef THREAD_H_
#define THREAD_H_

// NOTE: five includes were stripped during extraction; the set below is a
// reconstruction covering pthreads, perror(), errno, std::vector and assert().
// The CPU_* affinity macros additionally require _GNU_SOURCE (glibc).
#include <pthread.h>
#include <stdio.h>
#include <errno.h>
#include <vector>
#include <assert.h>

#define NUM_CPUS_MAX 48

/*
 * Abstract joinable thread class.
 * The only thing the implementer has to fill in is the run method.
 */
class Thread {
private:
    cpu_set_t *_cpu_set;
    pthread_attr_t _pthread_attr;
    pthread_t _threadID;
    bool _joinable, _startable;

    static void* start_pthread_func(void *obj) {
        // NOTE: the cast's target type was stripped during extraction;
        // <Thread*> is restored here.
        void* retval = reinterpret_cast<Thread*>(obj)->run();
        pthread_exit(retval);
        return retval;
    }
protected:
    virtual void* run() = 0;
public:
    Thread(bool joinable) : _cpu_set(NULL), _joinable(joinable), _startable(true) {
        pthread_attr_init(&_pthread_attr);
    }

    // NOTE: the vector element type was stripped during extraction; <int> is
    // restored from the CPU indices this class stores.
    Thread(bool joinable, std::vector<int>& cpus) : _cpu_set(NULL), _joinable(joinable), _startable(true) {
        pthread_attr_init(&_pthread_attr);
        setAffinity(cpus);
    }

    virtual ~Thread() {
        if (_cpu_set != NULL) {
            CPU_FREE(_cpu_set);
        }
        pthread_attr_destroy(&_pthread_attr);
    }

    void setAffinity(std::vector<int>& cpus) {
        assert(_startable);
        _cpu_set = CPU_ALLOC(NUM_CPUS_MAX);
        size_t size = CPU_ALLOC_SIZE(NUM_CPUS_MAX);
        if (cpus.size() > 0 && cpus[0] >= 0) {
            CPU_ZERO_S(size, _cpu_set);
            for (int i = 0; i < cpus.size(); i++) {
                assert(cpus[i] < NUM_CPUS_MAX);
                CPU_SET_S(cpus[i], size, _cpu_set);
                // printf("set cpu %d\n", cpus[i]);
            }
            pthread_attr_setaffinity_np(&_pthread_attr, size, _cpu_set);
        }
    }

    pthread_t start() {
        assert(_startable);
        _startable = false;
        pthread_attr_setdetachstate(&_pthread_attr, _joinable ?
PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED); int n; if ((n = pthread_create(&_threadID, &_pthread_attr, &Thread::start_pthread_func, (void*)this))) { errno = n; perror("pthread_create error"); } return _threadID; } void join(void **status) { assert(_joinable); int n; if((n = pthread_join(_threadID, status))) { errno = n; perror("pthread_join error"); } } void join() { join(NULL); } pthread_t getThreadID() const { return _threadID; } bool isStartable() const { return _startable; } }; #endif /* THREAD_H_ */ ================================================ FILE: caffe2/contrib/cuda-convnet2/util/src/matrix.cpp ================================================ /* * Copyright 2014 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/matrix.h" #include "../include/matrix_funcs.h" #if defined(_WIN64) || defined(_WIN32) double sqrt(int _X) {return sqrt((double) _X);} double log(int _X) {return log((double) _X);} #endif using namespace std; void Matrix::_init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData) { _updateDims(numRows, numCols); _ownsData = ownsData; _trans = transpose ? CblasTrans : CblasNoTrans; _data = data; } Matrix::Matrix() { _init(NULL, 0, 0, false, true); } Matrix::Matrix(int64 numRows, int64 numCols) { _init(NULL, numRows, numCols, false, true); this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL; } Matrix::Matrix(int64 numRows, int64 numCols, bool transpose) { _init(NULL, numRows, numCols, transpose, true); this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL; } Matrix::Matrix(const Matrix &like) { _init(NULL, like.getNumRows(), like.getNumCols(), false, true); this->_data = new MTYPE[this->_numElements]; } /* construct a matrix with another matrix's data. the resultant * matrix does NOT own its data */ Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols) { _init(data, numRows, numCols, false, false); } /* construct a matrix with another matrix's data (and optionally transpose it). the resultant * matrix does NOT own its data -- it is a VIEW */ Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose) { _init(data, numRows, numCols, transpose, false); } #ifdef NUMPY_INTERFACE Matrix::Matrix(const PyArrayObject *src) { this->_data = NULL; this->_trans = CblasNoTrans; if (src != NULL) { this->_updateDims(PyArray_DIM(src,0), PyArray_DIM(src,1)); if (src->flags & NPY_CONTIGUOUS || src->flags & NPY_FORTRAN) { this->_data = (MTYPE*) src->data; this->_ownsData = false; this->_trans = src->flags & NPY_CONTIGUOUS ? 
CblasNoTrans : CblasTrans;
        } else {
            this->_data = new MTYPE[PyArray_DIM(src,0) * PyArray_DIM(src,1)];
            for (int64 i = 0; i < PyArray_DIM(src,0); i++) {
                for (int64 j = 0; j < PyArray_DIM(src,1); j++) {
                    // NOTE: the cast's target type was stripped during
                    // extraction; <MTYPE*> is restored here.
                    (*this)(i,j) = *reinterpret_cast<MTYPE*>(PyArray_GETPTR2(src,i,j));
                }
            }
            this->_ownsData = true;
        }
    }
}
#endif

Matrix::~Matrix() {
    if(this->_data != NULL && this->_ownsData) {
        delete[] this->_data;
    }
}

void Matrix::_updateDims(int64 numRows, int64 numCols) {
    this->_numRows = numRows;
    this->_numCols = numCols;
    this->_numElements = numRows * numCols;
}

void Matrix::_checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const {
    assert(startRow >= 0 && startRow <= _numRows);
    assert(endRow >= 0 && endRow <= _numRows);
    assert(startCol >= 0 && startCol <= _numCols);
    assert(endCol >= 0 && endCol <= _numCols);
}

/* will return a view if possible */
Matrix& Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    if (!isTrans() && ((startCol == 0 && endCol == this->_numCols) || (startRow == endRow - 1))) {
        return *new Matrix(this->_data + startRow * this->_numCols + startCol, endRow - startRow, endCol - startCol);
    } else if (isTrans() && ((startRow == 0 && endRow == this->_numRows) || (startCol == endCol - 1))) {
        return *new Matrix(this->_data + startCol * this->_numRows + startRow, endRow - startRow, endCol - startCol, true);
    }
    Matrix& newSlice = *new Matrix(endRow - startRow, endCol - startCol);
    this->copy(newSlice, startRow, endRow, startCol, endCol, 0, 0);
    return newSlice;
}

/* this will NEVER return a view, unlike Matrix_slice */
void Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix& target) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ?
this->_numCols : endCol; _checkBounds(startRow, endRow, startCol, endCol); target.resize(endRow - startRow, endCol - startCol); this->copy(target, startRow, endRow, startCol, endCol, 0, 0); } Matrix& Matrix::sliceRows(int64 startRow, int64 endRow) const { return slice(startRow, endRow, 0, -1); } void Matrix::sliceRows(int64 startRow, int64 endRow, Matrix& target) const { slice(startRow, endRow, 0, -1, target); } Matrix& Matrix::sliceCols(int64 startCol, int64 endCol) const { return slice(0, -1, startCol, endCol); } void Matrix::sliceCols(int64 startCol, int64 endCol, Matrix& target) const { slice(0, -1, startCol, endCol, target); } void Matrix::subtractFromScalar(MTYPE scalar) { subtractFromScalar(scalar, *this); } void Matrix::subtractFromScalar(MTYPE scalar, Matrix& target) const { if(&target != this) { copy(target); } target.scale(-1); target.addScalar(scalar); } void Matrix::biggerThanScalar(MTYPE scalar) { biggerThanScalar(scalar, *this); } void Matrix::smallerThanScalar(MTYPE scalar) { smallerThanScalar(scalar, *this); } void Matrix::equalsScalar(MTYPE scalar) { equalsScalar(scalar, *this); } void Matrix::biggerThanScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_bigger, target); } void Matrix::smallerThanScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_smaller, target); } void Matrix::equalsScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_equal, target); } void Matrix::add(const Matrix &m) { add(m, 1, *this); } void Matrix::add(const Matrix &m, Matrix& target) { add(m, 1, target); } void Matrix::add(const Matrix &m, MTYPE scale) { add(m, scale, *this); } void Matrix::subtract(const Matrix &m) { add(m, -1, *this); } void Matrix::subtract(const Matrix &m, Matrix& target) { add(m, -1, target); } void Matrix::subtract(const Matrix &m, MTYPE scale) { add(m, -scale, *this); } void Matrix::subtract(const Matrix &m, MTYPE scale, Matrix& target) { add(m, -scale, target); } void Matrix::add(const Matrix &m, MTYPE scaleM, Matrix &target) { add(m, 1, scaleM, target); } void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM) { add(m, scaleThis, scaleM, *this); } void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target) { assert(this->isSameDims(m)); if (isTrans() != m.isTrans() || isTrans() != target.isTrans() || scaleThis != 1) { if (&target != this) { target.resize(*this); } if(scaleThis == 1 && scaleM == 1) { this->_applyLoop2(m, &_add, target); } else if (scaleThis == 1) { this->_applyLoop2(m, &_addWithScale, scaleM, target); } else { this->_applyLoop2(m, &_addWithScale2, scaleThis, scaleM, target); } } else { if (&target != this) { copy(target); } CBLAS_AXPY(getNumElements(), scaleM, m._data, 1, target._data, 1); } } void Matrix::addScalar(MTYPE scalar) { addScalar(scalar, *this); } void Matrix::addScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_add, target); } void Matrix::maxWithScalar(MTYPE scalar) { maxWithScalar(scalar, *this); } void Matrix::maxWithScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_max, target); } void Matrix::minWithScalar(MTYPE scalar) { minWithScalar(scalar, *this); } void Matrix::minWithScalar(MTYPE scalar, Matrix& target) const { target.resize(*this); _applyLoopScalar(scalar, &_min, target); } void Matrix::biggerThan(Matrix& a) { biggerThan(a, *this); } void Matrix::biggerThan(Matrix& a, Matrix& target) const { 
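// target(i, j) := ((*this)(i, j) > a(i, j)) ? 1 : 0, computed with the
// _bigger helper from matrix_funcs.h; target is first resized to match.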
assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_bigger, target); } void Matrix::smallerThan(Matrix& a) { smallerThan(a, *this); } void Matrix::smallerThan(Matrix& a, Matrix& target) const { assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_smaller, target); } void Matrix::equals(Matrix& a) { equals(a, *this); } void Matrix::equals(Matrix& a, Matrix& target) const { assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_equal, target); } void Matrix::notEquals(Matrix& a) { notEquals(a, *this); } void Matrix::notEquals(Matrix& a, Matrix& target) const { assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_notEqual, target); } void Matrix::minWith(Matrix &a) { minWith(a, *this); } void Matrix::minWith(Matrix &a, Matrix& target) const { assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_min, target); } void Matrix::maxWith(Matrix &a) { maxWith(a, *this); } void Matrix::maxWith(Matrix &a, Matrix& target) const { assert(isSameDims(a)); target.resize(*this); _applyLoop2(a, &_max, target); } /* this := this + scale*tile(vec) */ void Matrix::addVector(const Matrix& vec, MTYPE scale, Matrix& target) { if(&target != this) { copy(target); } assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); const bool rowVector = vec.getNumRows() == 1; const bool colVector = vec.getNumCols() == 1; assert((rowVector && vec.getNumCols() == target.getNumCols()) || (colVector && vec.getNumRows() == target.getNumRows())); if (rowVector && colVector) { addScalar(vec(0,0) * scale, target); return; } const int64 loopTil = rowVector ? target.getNumRows() : target.getNumCols(); const int64 dataInc = ((rowVector && target.isTrans()) || (!rowVector && !target.isTrans())) ? 1 : (rowVector ? target.getNumCols() : target.getNumRows()); const int64 myStride = ((target.isTrans() && rowVector) || (!target.isTrans() && !rowVector)) ? loopTil : 1; for (int64 i = 0; i < loopTil; i++) { CBLAS_AXPY(vec.getNumElements(), scale, vec._data, 1, target._data + dataInc * i, myStride); } } /* this := this + scale*tile(vec) */ void Matrix::addVector(const Matrix& vec, MTYPE scale) { addVector(vec, scale, *this); } void Matrix::addVector(const Matrix& vec) { addVector(vec, 1, *this); } void Matrix::addVector(const Matrix& vec, Matrix& target) { addVector(vec, 1, target); } void Matrix::eltWiseMultByVector(const Matrix& vec) { eltWiseMultByVector(vec, *this); } /* omg test these */ void Matrix::eltWiseMultByVector(const Matrix& vec, Matrix& target) { if(&target != this) { copy(target); } assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1); const bool rowVector = vec.getNumRows() == 1; assert((rowVector && vec.getNumCols() == target.getNumCols()) || (!rowVector && vec.getNumRows() == target.getNumRows())); const int64 dataInc = ((rowVector && !target.isTrans()) || (!rowVector && target.isTrans())) ? 1 : (rowVector ? target.getNumRows() : target.getNumCols()); const int64 myStride = ((!target.isTrans() && !rowVector) || (target.isTrans() && rowVector)) ? 1 : vec.getNumElements(); const int64 numScaling = rowVector ? 
target.getNumRows() : target.getNumCols(); for (int64 i = 0; i < vec.getNumElements(); i++) { CBLAS_SCAL(numScaling, vec._data[i], target._data + dataInc * i, myStride); } } /* return := scale * this * b */ void Matrix::rightMult(const Matrix& b, MTYPE scale) { rightMult(b, scale, *this); } /* return := this * b */ void Matrix::rightMult(const Matrix& b) { rightMult(b, 1); } /* target := this * b * also resizes target if necessary.*/ void Matrix::rightMult(const Matrix &b, Matrix &target) const { rightMult(b, 1, target); } /* target := scaleAB * this * b * also resizes target if necessary.*/ void Matrix::rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const { if(&target != this) { target.resize(this->_numRows, b._numCols); } target.addProduct(*this, b, scaleAB, 0); } /* this := scaleAB * a*b + scaleC * this * ALL SIZES MUST BE CORRECT. */ void Matrix::addProduct(const Matrix& a, const Matrix& b, MTYPE scaleAB, MTYPE scaleThis) { assert(a.getNumCols() == b.getNumRows()); assert(this->getNumRows() == a.getNumRows() && this->getNumCols() == b.getNumCols()); assert(!isTrans()); CBLAS_GEMM(CblasRowMajor, a._trans, b._trans, a._numRows, b._numCols, a._numCols, scaleAB, a._data, a._getNumColsBackEnd(), b._data, b._getNumColsBackEnd(), scaleThis, this->_data, this->_numCols); } void Matrix::addProduct(const Matrix& a, const Matrix& b) { addProduct(a, b, 1, 1); } Matrix& Matrix::transpose() const { return *new Matrix(this->_data, this->_numCols, this->_numRows, !isTrans()); } Matrix& Matrix::transpose(bool hard) const { if (!hard || isTrans()) { return transpose(); } Matrix &meTrans = *new Matrix(_numCols, _numRows); for (int64 i = 0; i < _numRows; i++) { for (int64 j = 0; j < _numCols; j++) { meTrans(j, i) = (*this)(i, j); } } return meTrans; } Matrix& Matrix::tile(int64 timesY, int64 timesX) const { Matrix& tiled = *new Matrix(this->_numRows * timesY, this->_numCols * timesX); _tileTo2(tiled); return tiled; } /* resizes target if necessary */ void Matrix::tile(int64 timesY, int64 timesX, Matrix& target) const { target.resize(this->_numRows * timesY, this->_numCols * timesX); _tileTo2(target); } /* a variant ... seems to be no faster than original. */ void Matrix::_tileTo2(Matrix& target) const { for(int64 y = 0; y < target._numRows; y += this->_numRows) { for(int64 x = 0; x < target._numCols; x += this->_numCols) { this->copy(target, 0, -1, 0, -1, y, x); } } } /* guarantees that result will be non-transposed */ void Matrix::resize(int64 newNumRows, int64 newNumCols) { if(this->_numRows != newNumRows || this->_numCols != newNumCols) { assert(!isView()); if (this->getNumElements() != newNumRows * newNumCols) { delete[] this->_data; //deleting NULL is ok, sez c++ this->_data = new MTYPE[newNumRows * newNumCols]; } this->_updateDims(newNumRows, newNumCols); this->_trans = CblasNoTrans; } } void Matrix::resize(const Matrix& like) { resize(like.getNumRows(), like.getNumCols()); } void Matrix::scale(MTYPE alpha) { scale(alpha, *this); } void Matrix::scale(MTYPE alpha, Matrix& target) { if (&target != this) { target.resize(*this); copy(target); } CBLAS_SCAL(getNumElements(), alpha, target._data, 1); } /* performs no resizing. * Warnings: * 1. ALL DIMENSIONS MUST BE CORRECT * 2. The source and destination memories better not overlap! */ void Matrix::copy(Matrix& dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const { srcEndRow = srcEndRow < 0 ? this->_numRows : srcEndRow; srcEndCol = srcEndCol < 0 ? 
this->_numCols : srcEndCol; assert(destStartRow >= 0 && destStartCol >= 0); //some range-checking assert(srcEndRow <= _numRows && srcEndCol <= _numCols); assert(destStartRow + srcEndRow - srcStartRow <= dest.getNumRows()); assert(destStartCol + srcEndCol - srcStartCol <= dest.getNumCols()); // I found no evidence that memcpy is actually faster than just // copying element-by-element. if (!isTrans() && !dest.isTrans()) { int64 src_start_idx = this->_numCols * srcStartRow + srcStartCol; int64 dest_start_idx = dest._numCols * destStartRow + destStartCol; int64 copy_row_width = srcEndCol - srcStartCol; for (int64 i = srcStartRow; i < srcEndRow; i++) { memcpy(dest._data + dest_start_idx + dest._numCols * (i - srcStartRow), this->_data + src_start_idx + this->_numCols * (i - srcStartRow), sizeof(MTYPE) * copy_row_width); } } else { for (int64 i = srcStartRow; i < srcEndRow; i++) { for (int64 j = srcStartCol; j < srcEndCol; j++) { dest(i - srcStartRow + destStartRow, j - srcStartCol + destStartCol) = (*this)(i, j); } } } } /* preserves everything excluding transposedness. * new matrix owns its data */ Matrix& Matrix::copy() const { Matrix& copy = *new Matrix(*this); this->copy(copy); return copy; } /* resizes target if necessary */ void Matrix::copy(Matrix& target) const { target.resize(this->_numRows, this->_numCols); //target is now non-transposed if(this->isTrans() == target.isTrans()) { this->_copyAllTo(target); } else { //if I'm transposed, make sure that target is non-transposed copy this->copy(target, 0, -1, 0, -1, 0, 0); } } void Matrix::_copyAllTo(Matrix& target) const { assert(target.isTrans() == isTrans()); memcpy((void*) target._data, (void*) this->_data, this->getNumDataBytes()); target._trans = this->_trans; } MTYPE Matrix::min() const { return _aggregate(&_min, MTYPE_MAX); } Matrix& Matrix::min(int64 axis) const { Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); this->min(axis, target); return target; } void Matrix::min(int64 axis, Matrix& target) const { _aggregate(axis, target, &_min, MTYPE_MAX); } MTYPE Matrix::max() const { return _aggregate(&_max, -MTYPE_MAX); } Matrix& Matrix::max(int64 axis) const { Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1); this->max(axis, target); return target; } void Matrix::max(int64 axis, Matrix& target) const { _aggregate(axis, target, &_max, -MTYPE_MAX); } MTYPE Matrix::sum() const { return _aggregate(&_add, 0); } MTYPE Matrix::norm() const { return sqrt(norm2()); } MTYPE Matrix::norm2() const { return _aggregate(&_addSquare, 0); } Matrix& Matrix::sum(int64 axis) const { Matrix& target = axis == 0 ? 
Matrix& Matrix::sum(int64 axis) const {
    Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1);
    this->sum(axis, target);
    return target;
}

void Matrix::sum(int64 axis, Matrix& target) const { _aggregate(axis, target, &_add, 0); }

void Matrix::_aggregate(int64 axis, Matrix& target, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    if (axis == 0) {
        target.resize(1, this->_numCols);
        for (int64 j = 0; j < this->_numCols; j++) {
            target(0, j) = _aggregateCol(j, agg_func, initialValue);
        }
    } else {
        target.resize(this->_numRows, 1);
        for (int64 i = 0; i < this->_numRows; i++) {
            target(i, 0) = _aggregateRow(i, agg_func, initialValue);
        }
    }
}

MTYPE Matrix::_aggregateRow(int64 row, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    for (int64 j = 0; j < this->_numCols; j++) {
        v = agg_func((*this)(row, j), v);
    }
    return v;
}

MTYPE Matrix::_aggregateCol(int64 col, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    for (int64 i = 0; i < this->_numRows; i++) {
        v = agg_func((*this)(i, col), v);
    }
    return v;
}

MTYPE Matrix::_aggregate(MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    MTYPE* ptr = _data;
    for (int64 i = 0; i < getNumElements(); i++, ptr++) {
        v = agg_func(*ptr, v);
    }
    return v;
}

void Matrix::printShape(const char* name) const {
    printf("%s: %lldx%lld\n", name, getNumRows(), getNumCols());
}

void Matrix::print() const { print(0, getNumRows(), 0, getNumCols()); }

void Matrix::print(int64 rows, int64 cols) const { print(0, rows, 0, cols); }

void Matrix::print(int64 startRow, int64 rows, int64 startCol, int64 cols) const {
    for (int64 i = startRow; i < std::min(startRow + rows, this->_numRows); i++) {
        for (int64 j = startCol; j < std::min(startCol + cols, this->_numCols); j++) {
            printf("%.15f ", (*this)(i, j));
        }
        printf("\n");
    }
}

void Matrix::apply(Matrix::FUNCTION f) { apply(f, *this); }

void Matrix::apply(Matrix::FUNCTION f, Matrix& target) {
    MTYPE (*func)(MTYPE);
    if (f == EXP) { func = &_exp; }
    else if (f == TANH) { func = &_tanh; }
    else if (f == RECIPROCAL) { func = &_recip; }
    else if (f == SQUARE) { func = &_square; }
    else if (f == LOG) { func = &_log; }
    else if (f == ZERO) { func = &_zero; }
    else if (f == ONE) { func = &_one; }
    else if (f == LOGISTIC1) { func = &_sigma1; }
    else if (f == LOGISTIC2) { func = &_sigma2; }
    else if (f == ABS) { func = &_abs; }
    else if (f == SIGN) { func = &_sign; }
    else {
        return; // LOG(FATAL) << "Matrix::apply: Unknown function type";
    }
    this->_applyLoop(func, target);
}

void Matrix::eltWiseMult(const Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    this->_applyLoop2(a, &_mult, target);
}

void Matrix::eltWiseDivide(const Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    this->_applyLoop2(a, &_divide, target);
}

void Matrix::eltWiseMult(const Matrix& a) { eltWiseMult(a, *this); }

void Matrix::eltWiseDivide(const Matrix& a) { eltWiseDivide(a, *this); }

void Matrix::randomizeUniform() { this->_applyLoop(&_rand); }

void Matrix::randomizeNormal() {
    // LOG(FATAL) << "randomizeNormal only implemented on MKL!";
}

void Matrix::randomizeNormal(MTYPE mean, MTYPE stdev) {
    // LOG(FATAL) << "randomizeNormal only implemented on MKL!";
}

void Matrix::eltWiseDivideByVector(const Matrix& vec) { eltWiseDivideByVector(vec, *this); }

/* This function allocates a chunk of memory at most as big as the input vector */
void Matrix::eltWiseDivideByVector(const Matrix& vec, Matrix& target) {
    assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1);
    const bool rowVector = vec.getNumRows() == 1;
    assert((rowVector && vec.getNumCols() == getNumCols()) || (!rowVector && vec.getNumRows() == getNumRows()));
    if (&target != this) {
        target.resize(*this);
    }
    _divideByVector(vec, target);
}
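// Example (illustrative): dividing every row of a 2x3 matrix by a 1x3 row
// vector, i.e. m(i, j) /= v(0, j). An Mx1 column vector would instead divide
// each row i by v(i, 0).
//
//   Matrix m(2, 3), v(1, 3);
//   m.randomizeUniform();
//   v.randomizeUniform();
//   m.eltWiseDivideByVector(v);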
void Matrix::_divideByVector(const Matrix& vec, Matrix& target) {
    Matrix& vecInverse = vec.copy();
    vecInverse.apply(RECIPROCAL);
    eltWiseMultByVector(vecInverse, target);
    delete &vecInverse;
}

void Matrix::reshape(int64 numRows, int64 numCols) {
    assert(_numElements == numRows * numCols);
    _numRows = numRows;
    _numCols = numCols;
}

Matrix& Matrix::reshaped(int64 numRows, int64 numCols) {
    assert(_numElements == numRows * numCols);
    return *new Matrix(_data, numRows, numCols, isTrans());
}

void Matrix::_applyLoop(MTYPE (*func)(MTYPE), Matrix& target) {
    MTYPE *ptr = this->_data, *tgtPtr = target._data;
    for (int64 i = 0; i < getNumElements(); i++, ptr++, tgtPtr++) {
        *tgtPtr = (*func)(*ptr);
    }
}

void Matrix::_applyLoop(MTYPE (*func)(MTYPE)) { _applyLoop(func, *this); }

void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE, MTYPE), Matrix& target) const {
    for (int64 i = 0; i < getNumRows(); i++) {
        for (int64 j = 0; j < getNumCols(); j++) {
            target(i, j) = (*func)((*this)(i, j), a(i, j));
        }
    }
}

void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE, MTYPE, MTYPE), MTYPE scalar, Matrix& target) const {
    for (int64 i = 0; i < getNumRows(); i++) {
        for (int64 j = 0; j < getNumCols(); j++) {
            target(i, j) = (*func)((*this)(i, j), a(i, j), scalar);
        }
    }
}

void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE, MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const {
    for (int64 i = 0; i < getNumRows(); i++) {
        for (int64 j = 0; j < getNumCols(); j++) {
            target(i, j) = (*func)((*this)(i, j), a(i, j), scalar1, scalar2);
        }
    }
}

void Matrix::_applyLoopScalar(const MTYPE scalar, MTYPE (*func)(MTYPE, MTYPE), Matrix& target) const {
    MTYPE* myPtr = _data;
    MTYPE* targetPtr = target._data;
    for (int64 i = 0; i < getNumElements(); i++, myPtr++, targetPtr++) {
        *targetPtr = (*func)(*myPtr, scalar);
    }
}

bool Matrix::hasNan() const {
    for (int64 r = 0; r < _numRows; r++) {
        for (int64 c = 0; c < _numCols; c++) {
            if (isnan((*this)(r, c))) {
                return true;
            }
        }
    }
    return false;
}

bool Matrix::hasInf() const {
    for (int64 r = 0; r < _numRows; r++) {
        for (int64 c = 0; c < _numCols; c++) {
            if (isinf((*this)(r, c))) {
                return true;
            }
        }
    }
    return false;
}

================================================
FILE: caffe2/contrib/docker-ubuntu-14.04/Dockerfile
================================================
FROM ubuntu:14.04
MAINTAINER caffe-dev

# A docker container with CUDA and caffe2 installed.
# Note: this should install everything but cudnn, which requires you to have a
# manual registration and download from the NVidia website. After creating this
# docker image, the Caffe2 repository is located at /opt/caffe2. You can install
# cudnn manually and re-compile caffe2.

################################################################################
# Step 1: set up cuda on the ubuntu box.
################################################################################
RUN apt-get update && apt-get install -q -y \
    build-essential \
    wget

RUN cd /tmp && \
    wget http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run && \
    chmod +x cuda_*_linux.run && ./cuda_*_linux.run -extract=`pwd` && \
    ./NVIDIA-Linux-x86_64-*.run -s --no-kernel-module && \
    ./cuda-linux64-rel-*.run -noprompt && \
    rm -rf *

# Ensure the CUDA libs and binaries are in the correct environment variables
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV PATH=$PATH:/usr/local/cuda/bin

# Run nvcc to make sure things are set correctly.
RUN nvcc --version

################################################################################
# Step 2: set up caffe2 pre-requisites
################################################################################
RUN apt-get update && apt-get install -q -y \
    git \
    libeigen3-dev \
    libgoogle-glog-dev \
    libleveldb-dev \
    liblmdb-dev \
    libopencv-dev \
    libprotobuf-dev \
    libsnappy-dev \
    zlib1g-dev \
    libbz2-dev \
    protobuf-compiler \
    python-dev \
    python-pip

RUN cd /tmp && \
    git clone https://github.com/facebook/rocksdb.git && \
    cd /tmp/rocksdb && \
    make && make install && \
    cd / && \
    rm -rf /tmp/rocksdb

# Caffe2 works best with openmpi 1.8.5 or above (which has cuda support).
# If you do not need openmpi, skip this step.
RUN cd /tmp && \
    wget http://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-1.10.0.tar.gz && \
    tar xzvf openmpi-1.10.0.tar.gz && \
    cd /tmp/openmpi-1.10.0 && \
    ./configure --with-cuda --with-threads && \
    make && make install && \
    cd / && \
    rm -rf /tmp/openmpi-1.10.0 && \
    rm /tmp/openmpi-1.10.0.tar.gz

# Caffe2 requires zeromq 4.0 or above, manually install.
# If you do not need zeromq, skip this step.
RUN apt-get install -q -y autoconf libtool
RUN mkdir /tmp/zeromq-build && \
    cd /tmp/zeromq-build && \
    wget https://github.com/zeromq/zeromq4-1/archive/v4.1.3.tar.gz && \
    tar xzvf v4.1.3.tar.gz --strip 1 && \
    ./autogen.sh && \
    ./configure --without-libsodium && \
    make && make install && \
    cd / && \
    rm -rf /tmp/zeromq-build

# pip self upgrade
RUN pip install --upgrade pip

# Python dependencies
RUN pip install \
    matplotlib \
    numpy \
    protobuf

################################################################################
# Step 3: install optional dependencies ("good to have" features)
################################################################################
RUN apt-get install -q -y \
    gfortran \
    graphviz \
    libatlas-base-dev \
    vim
RUN pip install \
    flask \
    ipython \
    notebook \
    pydot \
    python-nvd3 \
    scipy \
    tornado

# This is intentional. scikit-image has to be after scipy.
RUN pip install \
    scikit-image

################################################################################
# Step 4: set up caffe2
################################################################################
# Get the repository, and build.
RUN cd /opt && \
    git clone https://github.com/Yangqing/caffe2.git && \
    cd /opt/caffe2 && \
    make

# Now, we know that some of the caffe tests will fail. How do we deal with
# those?
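# Example usage: build the image from the directory containing this Dockerfile
# and start a shell inside it. The tag "caffe2:cuda7.0" is an arbitrary
# illustration.
#
#   docker build -t caffe2:cuda7.0 .
#   docker run -it caffe2:cuda7.0 /bin/bash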
================================================
FILE: caffe2/contrib/gloo/CMakeLists.txt
================================================
if(USE_GLOO)
  set(Caffe2_CONTRIB_GLOO_CPU_SRC
    "${CMAKE_CURRENT_SOURCE_DIR}/allgather_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/barrier_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
  )

  set(Caffe2_CONTRIB_GLOO_GPU_SRC
    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
  )

  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
endif()

================================================
FILE: caffe2/contrib/gloo/allgather_ops.cc
================================================
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "allgather_ops.h"

#include <gloo/allgather_ring.h>

namespace caffe2 {
namespace gloo {

template <class Context>
void AllgatherOp<Context>::initializeAlgorithm() {
  if (init_.template IsType<float>()) {
    algorithm_.reset(new ::gloo::AllgatherRing<float>(
        init_.context,
        init_.template getInputs<float>(),
        init_.template getOutput<float>(),
        init_.size));
  } else if (init_.template IsType<long>()) {
    algorithm_.reset(new ::gloo::AllgatherRing<long>(
        init_.context,
        init_.template getInputs<long>(),
        init_.template getOutput<long>(),
        init_.size));
  } else if (init_.template IsType<int>()) {
    algorithm_.reset(new ::gloo::AllgatherRing<int>(
        init_.context,
        init_.template getInputs<int>(),
        init_.template getOutput<int>(),
        init_.size));
  } else if (init_.template IsType<::caffe2::float16>()) {
    algorithm_.reset(new ::gloo::AllgatherRing<::gloo::float16>(
        init_.context,
        init_.template getInputs<::gloo::float16>(),
        init_.template getOutput<::gloo::float16>(),
        init_.size));
  } else {
    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
  }
}

namespace {

REGISTER_CPU_OPERATOR_WITH_ENGINE(Allgather, GLOO, AllgatherOp<CPUContext>);

} // namespace
} // namespace gloo
} // namespace caffe2
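// Example (illustrative; blob names are arbitrary): the op is created like any
// other Caffe2 operator, with the GLOO engine selected. The first input must
// be the common world blob holding the shared ::gloo::Context; the remaining
// inputs are the local tensors to gather.
//
//   OperatorDef def;
//   def.set_type("Allgather");
//   def.set_engine("GLOO");
//   def.add_input("common_world");
//   def.add_input("local_chunk");
//   def.add_output("gathered");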
================================================
FILE: caffe2/contrib/gloo/allgather_ops.h
================================================
/**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>

#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>

namespace caffe2 {
namespace gloo {

template <class Context>
class AllgatherOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  AllgatherOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        ws_(ws),
        status_blob_(
            OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
    if (status_blob_ != "") {
      ws_->CreateBlob(status_blob_);
    }
  }

  virtual ~AllgatherOp() {}

  bool RunOnDevice() override {
    std::call_once(once_, [&] { initialize(); });

    // If any parameter has changed in between runs, the initialized
    // algorithm is invalid and cannot be used.
    update(current_);
    CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");

    try {
      algorithm_->run();
    } catch (::gloo::IoException& ioe) {
      LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
      if (status_blob_ != "") {
        signalFailure(ws_->GetBlob(status_blob_), ioe);
        return false;
      } else {
        throw ioe;
      }
    }
    return true;
  }

 protected:
  void initialize() {
    // Allocate output tensor
    CAFFE_ENFORCE_EQ(OutputSize(), 1);
    auto comm_size =
        OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
    const auto dims =
        std::vector<TIndex>(1, (InputSize() - 1) * Input(1).size() * comm_size);
    Output(0)->Resize(dims);

    // Store which inputs/outputs this instance initialized with
    update(init_);
    CAFFE_ENFORCE_EQ(init_.outputs.size(), 1);

    // Verify tensors all have same size
    size_t size = Input(1).size();
    for (auto i = 2; i < InputSize(); i++) {
      CAFFE_ENFORCE_EQ(Input(i).size(), size);
    }

    // Verify tensors all have same type
    TypeMeta meta = Input(1).meta();
    for (auto i = 2; i < InputSize(); i++) {
      CAFFE_ENFORCE(Input(i).meta() == meta);
    }

    // Finally initialize the algorithm
    initializeAlgorithm();
  }

  void initializeAlgorithm();

  std::once_flag once_;
  std::unique_ptr<::gloo::Algorithm> algorithm_;

  // Captures the parameters passed to Gloo when first initialized.
  // An instance is updated every time this op runs and is compared
  // to the reference instance for equality. If any parameter has
  // changed from run to run, the initialized algorithm is invalid.
  void update(GlooParameters& params) {
    params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
    params.inputs.resize(InputSize() - 1);
    params.size = Input(1).size();
    params.meta = Input(1).meta();
    for (auto i = 0; i < params.inputs.size(); i++) {
      params.inputs[i] = Input(i + 1).raw_data();
    }
    params.outputs.resize(OutputSize());
    params.outputs[0] = Output(0)->raw_mutable_data(params.meta);
  }

  GlooParameters init_;
  GlooParameters current_;
  Workspace* ws_;
  std::string status_blob_;
};

} // namespace gloo
} // namespace caffe2
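// The op above follows an initialize-once / verify-every-run pattern:
//
//   std::call_once(once_, [&] { initialize(); }); // first run builds the algorithm
//   update(current_);                             // every run recaptures parameters
//   CAFFE_ENFORCE(current_ == init_);             // they must match the initial ones
//   algorithm_->run();                            // reuse the cached Gloo algorithm
//
// GlooParameters equality (see common.h) covers the context, the input/output
// pointers, and the element size/type, so a change to any participating blob
// invalidates the cached algorithm instead of silently running on stale buffers.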
================================================
FILE: caffe2/contrib/gloo/allreduce_ops.cc
================================================
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "allreduce_ops.h"

#include <gloo/allreduce_halving_doubling.h>
#include <gloo/allreduce_ring.h>
#include <gloo/allreduce_ring_chunked.h>
#include <gloo/types.h>

namespace caffe2 {
namespace gloo {

template <class Context>
void AllreduceOp<Context>::initializeHalvingDoubling() {
  if (init_.template IsType<float>()) {
    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<float>(
        init_.context, init_.template getOutputs<float>(), init_.size));
  } else if (init_.template IsType<::caffe2::float16>()) {
    algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<::gloo::float16>(
        init_.context,
        init_.template getOutputs<::gloo::float16>(),
        init_.size));
  } else {
    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
  }
}

template <class Context>
void AllreduceOp<Context>::initializeRingFull() {
  if (init_.template IsType<float>()) {
    algorithm_.reset(new ::gloo::AllreduceRing<float>(
        init_.context, init_.template getOutputs<float>(), init_.size));
  } else if (init_.template IsType<::caffe2::float16>()) {
    algorithm_.reset(new ::gloo::AllreduceRing<::gloo::float16>(
        init_.context,
        init_.template getOutputs<::gloo::float16>(),
        init_.size));
  } else {
    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
  }
}

template <class Context>
void AllreduceOp<Context>::initializeRingChunked() {
  if (init_.template IsType<float>()) {
    algorithm_.reset(new ::gloo::AllreduceRingChunked<float>(
        init_.context, init_.template getOutputs<float>(), init_.size));
  } else if (init_.template IsType<::caffe2::float16>()) {
    algorithm_.reset(new ::gloo::AllreduceRingChunked<::gloo::float16>(
        init_.context,
        init_.template getOutputs<::gloo::float16>(),
        init_.size));
  } else {
    CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
  }
}

namespace {

REGISTER_CPU_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CPUContext>);

} // namespace
} // namespace gloo
} // namespace caffe2
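// Example (illustrative; blob names are arbitrary): Allreduce runs in place,
// so each output must name the same blob as the corresponding input (the op
// enforces inputs == outputs at initialization).
//
//   OperatorDef def;
//   def.set_type("Allreduce");
//   def.set_engine("GLOO");
//   def.add_input("common_world");
//   def.add_input("grad");
//   def.add_output("grad");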
update(current_); CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed"); try { algorithm_->run(); } catch (::gloo::IoException& ioe) { LOG(ERROR) << "Caught gloo IO exception: " << ioe.what(); if (status_blob_ != "") { signalFailure(ws_->GetBlob(status_blob_), ioe); return false; } else { throw ioe; } } return true; } protected: void initialize() { Mode mode = HALVING_DOUBLING; auto bytes = Input(1).nbytes(); // Store which inputs/outputs this instance initialized with update(init_); // Verify inputs == ouputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); for (auto i = 0; i < init_.inputs.size(); i++) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); } // Verify tensors all have same size size_t size = Input(1).size(); for (auto i = 2; i < InputSize(); i++) { CAFFE_ENFORCE_EQ(Input(i).size(), size); } // Verify tensors all have same type TypeMeta meta = Input(1).meta(); for (auto i = 2; i < InputSize(); i++) { CAFFE_ENFORCE(Input(i).meta() == meta); } switch (mode) { case RING_FULL: initializeRingFull(); return; case RING_CHUNKED: initializeRingChunked(); return; case HALVING_DOUBLING: initializeHalvingDoubling(); return; } CAFFE_ENFORCE(false, "Unreachable code"); } void initializeHalvingDoubling(); void initializeRingFull(); void initializeRingChunked(); std::once_flag once_; std::unique_ptr<::gloo::Algorithm> algorithm_; // Captures the parameters passed to Gloo when first initialized. // An instance is updated every time this op runs and is compared // to the reference instance for equality. If any parameter has // changed from run to run, the initialized algorithm is invalid. void update(GlooParameters& params) { params.context = OperatorBase::Input>(0); params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); for (auto i = 0; i < params.inputs.size(); i++) { params.inputs[i] = Input(i + 1).template raw_data(); params.outputs[i] = Output(i)->template raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); } GlooParameters init_; GlooParameters current_; Workspace* ws_; std::string status_blob_; const bool gpu_direct_; }; } // namespace gloo } // namespace caffe2 ================================================ FILE: caffe2/contrib/gloo/allreduce_ops_gpu.cc ================================================ /** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "allreduce_ops.h" #include "caffe2/core/context_gpu.h" #include "caffe2/core/logging.h" #include #include #include #include namespace caffe2 { namespace gloo { namespace { // Decides on using GPUDirect based on device support. template