From 6caf5d5cfdae84480dbd31673045355c01b7b3da Mon Sep 17 00:00:00 2001
From: hanhaowen-mt <144977798+hanhaowen-mt@users.noreply.github.com>
Date: Mon, 13 May 2024 19:26:02 +0800
Subject: [PATCH] paddle_musa v2.6.0 release initialization (#64265)

* Revert "fix rpc_sync and rpc_async doc;test=develop (#64107)" This reverts commit 131999233ef997fc8d3f24b27830925b78cf17aa.
* Revert "[Dy2St][2.6] Disable `test_sentiment` on release/2.6 (#63197)" This reverts commit 90138318312fbb60b0bdce8b0f4fb317879fe62e.
* Revert "Revert "fix security (#62626) (#62683)" (#62890)" This reverts commit 89a60d773893640b7cec91d4857d634a61b02304.
* Revert "Enhance several unit tests (#62477) (#62776)" This reverts commit 0348f3f9ab802ab8456cf8ee0fbac290af089088.
* Revert "[Fix_ci] set PLUGIN_TAG release/2.6 (#62731)" This reverts commit 97ffa07890e8e0d780a91ded73fc56cace2c9242.
* Revert "fix security (#62626) (#62683)" This reverts commit 6a735475b13c4f0800d63e7e63e9ca583d96c96a.
* Revert "add more capi to support stride (#62716)" This reverts commit 683a141dcffd971da4a6dfefd3109ba9629d6f05.
* Revert "[XPU] default no autotune (#62636)" This reverts commit fde63d149d63a9e1d4954416dc2e25cb0d4ff954.
* Revert "[DCU] fix dcu compile failure (#62573)" This reverts commit d527fb55386daa88aca4ad57f8081e7dc81af0f9.
* Revert "[AutoParallel] Adjust time restriction for test_semi_auto_parallel_hybrid_strategy.py (#62278)" This reverts commit fbf852dd832bc0e63ae31cd4aa37defd829e4c03.
* Revert "disable llm_int8 ut (#62282)" This reverts commit e8165299fa3a926b5bc0c138c6307bf3d4967fd2.
* Revert "fix openssl-cpu compile bug (#62079) (#62224)" This reverts commit 59c61db4b55f96153d7c9e214af60578ce5c69d0.
* Revert "[CINN] Add IntrinsicOps into ir_codes_collector (#60556) (#62245)" This reverts commit 773ea41598eb981da15876aaf87c172382f533cc.
* Revert "rm graph_reindex_test (#62057)" This reverts commit 521dc70381388499ba43cf05b9dc95f39b8da43a.
* Revert "fix (#61923) (#62186)" This reverts commit d077553c5987ec619d777b5d4e930f0994406b76.
* Revert "fix cpups training bug:executor trainer use_ps_gpu value;test=develop (#62111)" This reverts commit d8049754b69c6d62f4c77a9db161ea80d52091d2.
* Revert "[cherry-pick 2.6] Fix bug of put_along_axis/take_along_axis (#62065)" This reverts commit 3a083c37467ec7d86493f4a6f50b975e2984ac8b.
* Revert "[Cherry-pick] Fix indexing shape bug and Optimize (#62117)" This reverts commit 609f55eed26b0db6106dc8be45a782de3d4b619b.
* Revert "cherry pick: reduce log for type promotion. (#62116)" This reverts commit f4d9adfbe60f6cfb0235bac2c8aed940225e7311.
* Revert "fix test_communicator_half_async random core;test=develop (#62092)" This reverts commit dba99929f6f1e349d586ffa5455117d2249b7141.
* Revert "fix the unqiue op that generate the wrong the inreverse result (#62104)" This reverts commit b89066ae29ab20f9c35e10f52093613d505d2f83.
* Revert "[Cherry-pick] Fix Paddle-TRT UT fails (#61605)" This reverts commit 867ab0d632e375c93fbec471ed2f1cce2cbba920.
* Revert "fix se (#61640) (#61702)" This reverts commit c0f4a4975a473ce64ffa1e8b5fdc7b73802669db.
* Revert "fix dataloaer for toolkit (#61867) (#61994)" This reverts commit b50e906d8b019d75d916b17485cef32831270a85.
* Revert "[Cherry-Pick] Fix CacheKV Quant Bug (#61966)" This reverts commit 04ac1c0f90d545f9a9f00fe44c9d291a5b93a786.
* Revert "[Paddle-TRT] fix solve (#61806)" This reverts commit df0155fa0d9010fd900d2727de1d8b4f8e0fd805.
* Revert "fix launch when elastic run (#61847) (#61878)" This reverts commit f09d9d88653e8c9370ad76f89a44b3b0fcd6f879.
* Revert "Support Fake GroupWise Quant (#61900)" This reverts commit 2175de006de702f5ba514126ff7dbaf4958424f3.
* Revert "repeat_interleave support bf16 dtype (#61854) (#61899)" This reverts commit 96c2aafdc4b8950d23fd54064833261d906b15ac.
* Revert "[security] refine _get_program_cache_key (#61827) (#61896)" This reverts commit b6a38d09636566d78477c43f5610775cad5799e0.
* Revert "merge (#61866)" This reverts commit 39010bfc8f2e5621b121ee2daa2da907c3d03f0b.
* Revert "fix doc style (#61688)" This reverts commit 12e5c97c78b8ec9a8b38aeaafe74cde0980be871.
* Revert "fix layer_norm decompose dtyte bugs, polish codes (#61631)" This reverts commit e5a85b63b70a5c3ba6fabd775566c40e95f19388.
* Revert "remove _wget (#61356) (#61569)" This reverts commit 9250f66d63720b182b1002cd6d8554894e58502e.
* Revert "cinn(py-dsl): skip eval string in python-dsl (#61380) (#61586)" This reverts commit a37f6fb60a69de3c7942369da14b23beec8a5c28.
* Revert "Fix unique (#60840) (#61044)" This reverts commit 3452e612fd41b39e43b541595f52bf6be571ff87.
* Revert "[CherryPick] Fix issue 60092 (#61427)" This reverts commit f0253852fe3ce43c3029b7a07b8bcf6fb9ab0d7b.
* Revert "[cherry-pick] adapt c_embedding to phi namespace for custom devices (#60774) (#61045)" This reverts commit 0ccb9cbe1566029847ed26352c61b5b2e009cfad.
* Revert "check eval for security (#61389)" This reverts commit 60325a1b548fb827fe40020c18f56ee78899d4bf.
* Revert "[Security] fix download security problem (#61162) (#61388)" This reverts commit 5f3bbeb515c2a5c1ea9521bf74a5e6078a4488e2.
* Revert "[Security] fix security problem for run_cmd (#61285) (#61398)" This reverts commit 9cd0c91934a313352eeb84dd3ee52df7821237a4.
* Revert "[Security] fix security problem for prune_by_memory_estimation (#61382)" This reverts commit af9b8c5e55a40fd3143bcc2772a81cc48d7b5cf3.
* Revert "Fix CVE-2024-0521 (#61032) (#61287)" This reverts commit f99d4f2d649d07fe7be420514c53090f11ec46a9.
* Revert "fix _decompress security problem (#61294) (#61337)" This reverts commit 0227a0db164ffde43ef9a577409ee0f59a3c518c.
* Revert "[Security] fix draw security problem (#61161) (#61338)" This reverts commit aeaa0cae925144334d693aac91efa92f98421190.
* Revert "fix qat tests (#61211) (#61284)" This reverts commit ff119d07af2fc3f3f5d1399d102434a36802ea75.
* Revert "fix core dump when fallback gather_nd_grad and MemoryAllocateHost (#61067)" This reverts commit ac1702b5eac63fbcdb12dae1e6427cd8e02a5108.
* Revert "[cherry-pick] This PR enable offset of generator for custom device. (#60616) (#60772)" This reverts commit 0f732a53b229f43db4395b26fc473ce1f0a3c873.
* Revert "[Cherry-pick] fix set_value with scalar grad (#60930)" This reverts commit 1aa5f4b7fb64b5eccb0ad287d6170e4eda25fca7.
* Revert "[Dy2St][2.6] Increase `test_transformer` and `test_mobile_net` ut time (#60829) (#60875)" This reverts commit d788e9bc75524f161ccedd392200f16c27b904d5.
* Revert "[Dy2St][2.6] Disable `test_transformer` on `release/2.6` and update README (#60786)" This reverts commit e738f49af3488010ea955d8c0a2cfa47c4978218.
* Revert "fix bug of ci (#59926) (#60785)" This reverts commit 7b0d2e9377d20a38d32b04d1e156fb5c068d35a4.
* Revert "[Dy2St][2.6] Disable `test_grad` on release/2.6 (#60662)" This reverts commit e50f43e99673a1a349b28877a2a1b0d5c69f84ab.
* Revert "[cherry-pick]update pdsa-2023-019 (#60649)" This reverts commit ccdf5282b0b0aa495b59dfd5aa9d23e659b09147.
* Revert "[cherry-pick]fix fleetutil get_online_pass_interval bug3 (#60620)" This reverts commit bbc13ebe07d682020ec9bc3eace3da020275fde1.
* Revert "fix fused_rope diff (#60217) (#60593)" This reverts commit 97b65c7d43f4ce28366d17b907051b1ef3d9f643.
* Revert "fix fleetutil get_online_pass_interval bug2; test=develop (#60545)" This reverts commit ae2e58805cbcf800c6d0487adeea7509c6a454d9.
* Revert "update 2023 security advisory, test=document_fix (#60532)" This reverts commit 83ce809bb03960647622cdca7e3e4dcfca3b5e04.
* Revert "add chunk allocator posix_memalign return value check (#60208) (#60495)" This reverts commit b065877d9ade6ea169755586b3dc957105ddd8cf.
* Revert "tile (#60261)" This reverts commit 203754e47a957731d14293dfdcf189b98ccdc7d0.
* Revert "[Cherry-pick] fix weight quant kernel bug when n div 64 != 0 (#60184)" This reverts commit 20d3558769fc80099c875d4a4ef917a3ad56a24d.
* Revert "[Dy2St] Disable `test_bert` on CPU (#60173) (#60324)" This reverts commit a4cd8477c9671197510a815b1f46626fff3066be.
* Revert "fix windows bug for common lib (#60308)" This reverts commit 1b696a124bdafc84020d2e9d3ac48df9e79a4f4e.
* update to v2.6.0
* enable WITH_DISTRIBUTE in CMakeLists.txt and port the related source files from CUDA to MUSA
* fix several bugs that appear when WITH_DISTRIBUTE is enabled
* delete a useless cout in ../paddle/phi/backends/gpu/musa/musa_info.cc and set the compute capability to 9.9 for UT
---
 .gitmodules | 8 - CMakeLists.txt | 71 +- README.md | 2 +- README_cn.md | 4 +- README_ja.md | 2 +- cmake/configure.cmake | 13 + cmake/cupti.cmake | 6 +- cmake/external/cryptopp.cmake | 18 +- cmake/external/eigen.cmake | 70 + cmake/flags.cmake | 5 + cmake/generic.cmake | 143 +- cmake/inference_lib.cmake | 15 +- cmake/mccl.cmake | 51 + cmake/mudnn.cmake | 92 + cmake/musa.cmake | 128 ++ cmake/operators.cmake | 94 +- cmake/phi.cmake | 2 +- paddle/cinn/ir/ir_base.h | 9 +- paddle/cinn/ir/utils/ir_nodes_collector.cc | 67 +- paddle/common/array.h | 8 +- paddle/common/hostdevice.h | 6 +- paddle/common/macros.h | 2 +- .../distributed/collective/CMakeLists.txt | 4 +- .../collective/process_group_nccl.cc | 30 +- .../collective/process_group_nccl.h | 2 +- .../collective/processgroup_comm_utils.cc | 6 +- .../fluid/distributed/collective/reducer.cc | 6 +- .../distributed/common/chunk_allocator.h | 14 +- .../distributed/fleet_executor/carrier.cc | 2 +- .../fleet_executor/cond_interceptor.cc | 2 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../distributed/fleet_executor/message_bus.cc | 2 +- .../forwards/multiply_fwd_func.cc | 10 +- .../eager/auto_code_generator/CMakeLists.txt | 4 + .../generator/eager_gen.py | 2 +- .../generator/python_c_gen.py | 2 +- paddle/fluid/eager/nan_inf_utils.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 7 +- paddle/fluid/framework/conv_search_cache.h | 18 + paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 4 +- paddle/fluid/framework/data_feed.cu | 40 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 71 +- .../framework/details/all_reduce_op_handle.cc | 12 +- .../framework/details/all_reduce_op_handle.h | 8 +- .../framework/details/broadcast_op_handle.cc | 6 +- .../framework/details/broadcast_op_handle.h | 8 +- .../fluid/framework/details/build_strategy.cc | 12 +- .../fluid/framework/details/build_strategy.h | 2 +- .../details/eager_deletion_op_handle.cc | 21 +- .../details/eager_deletion_op_handle.h | 2 +- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 21
+- .../details/fused_all_reduce_op_handle.h | 6 +- .../details/fused_broadcast_op_handle.h | 4 +- .../grad_merge_all_reduce_op_handle.cc | 6 +- .../details/grad_merge_all_reduce_op_handle.h | 6 +- .../framework/details/nan_inf_utils_detail.cc | 2 +- .../fluid/framework/details/nccl_op_handle.h | 61 +- .../fluid/framework/details/op_handle_base.cc | 34 +- .../fluid/framework/details/op_handle_base.h | 2 +- .../framework/details/reduce_op_handle.cc | 8 +- .../framework/details/reduce_op_handle.h | 6 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/share_tensor_buffer_op_handle.cc | 2 +- .../details/sparse_all_reduce_op_handle.cc | 6 +- paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/device_worker.h | 20 +- .../fluid/framework/device_worker_factory.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/fleet/CMakeLists.txt | 14 +- paddle/fluid/framework/fleet/box_wrapper.cu | 22 + paddle/fluid/framework/fleet/box_wrapper.h | 3 + .../fluid/framework/fleet/box_wrapper_impl.h | 13 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 5 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- .../framework/fleet/heter_ps/CMakeLists.txt | 15 + .../fleet/heter_ps/graph_gpu_wrapper.cu | 6 +- .../fleet/heter_ps/graph_gpu_wrapper.h | 8 +- .../framework/fleet/heter_ps/heter_comm.h | 8 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 16 +- .../framework/fleet/heter_ps/heter_ps.cu | 4 +- .../fluid/framework/fleet/heter_ps/heter_ps.h | 4 +- .../framework/fleet/heter_ps/heter_ps_base.h | 4 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +- paddle/fluid/framework/fleet/heter_wrapper.h | 2 +- paddle/fluid/framework/fleet/nccl_wrapper.cc | 22 +- paddle/fluid/framework/fleet/nccl_wrapper.h | 10 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 12 +- paddle/fluid/framework/garbage_collector.cc | 8 +- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/framework/hogwild_worker.cc | 18 +- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- paddle/fluid/framework/ir/cost_model.cc | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 4 +- .../framework/ir/fuse_bn_add_act_pass.cc | 4 +- .../framework/ir/fusion_group/CMakeLists.txt | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- .../ir/fusion_group/cuda_resources.h | 2 +- paddle/fluid/framework/ir/graph_helper.cc | 6 +- ...est_reference_count_pass_last_lived_ops.cc | 2 +- .../all_reduce_deps_pass.cc | 2 +- .../fuse_all_reduce_op_pass.cc | 16 +- .../multi_devices_graph_pass.cc | 18 +- .../multi_devices_graph_pass.h | 4 +- .../instruction/instruction_util.cc | 6 +- .../interpreter/execution_config.cc | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../interpreter/stream_analyzer.cc | 4 +- .../new_executor/interpreter_base_impl.h | 4 +- .../new_executor/new_executor_defs.cc | 4 +- .../new_executor/new_executor_defs.h | 4 +- .../framework/new_executor/pir_interpreter.cc | 10 +- .../fluid/framework/new_executor/profiler.h | 2 +- .../new_executor/program_interpreter.cc | 22 +- .../new_executor/program_interpreter.h | 4 +- paddle/fluid/framework/op_registry.h | 4 +- paddle/fluid/framework/operator.cc | 18 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 60 +- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/phi_utils.cc | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 4 +- paddle/fluid/framework/ps_gpu_trainer.cc | 2 +- paddle/fluid/framework/ps_gpu_worker.cc | 6 +- 
paddle/fluid/framework/pull_dense_worker.cc | 14 +- paddle/fluid/framework/section_worker.cc | 4 +- paddle/fluid/framework/tensor_util.cc | 14 +- paddle/fluid/framework/tensor_util.h | 8 +- paddle/fluid/framework/trainer.h | 12 +- paddle/fluid/framework/trainer_factory.cc | 6 +- paddle/fluid/framework/var_type_traits.cc | 7 + paddle/fluid/framework/var_type_traits.h | 20 +- paddle/fluid/imperative/CMakeLists.txt | 11 +- paddle/fluid/imperative/all_reduce.cc | 37 +- paddle/fluid/imperative/all_reduce.h | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 +- paddle/fluid/imperative/gloo_context.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 18 +- paddle/fluid/imperative/nccl_context.cc | 22 +- paddle/fluid/imperative/nccl_context.h | 10 +- paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/reducer.cc | 10 +- paddle/fluid/imperative/reducer.cu | 2 +- paddle/fluid/imperative/reducer.h | 2 +- paddle/fluid/imperative/tracer.cc | 6 +- paddle/fluid/inference/CMakeLists.txt | 2 +- .../ir_params_sync_among_devices_pass.cc | 4 +- .../ir_params_sync_among_devices_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 15 +- .../fluid/inference/api/analysis_predictor.cc | 35 +- .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 21 +- paddle/fluid/inference/api/infer_context.cc | 2 +- paddle/fluid/inference/api/infer_context.h | 4 +- .../inference/api/paddle_analysis_config.h | 3 +- paddle/fluid/inference/api/paddle_api.h | 3 + .../inference/api/paddle_pass_builder.cc | 5 +- .../fluid/inference/api/resource_manager.cc | 79 +- paddle/fluid/inference/api/resource_manager.h | 26 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 28 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 3 + paddle/fluid/inference/utils/CMakeLists.txt | 13 + paddle/fluid/inference/utils/benchmark.cc | 54 + paddle/fluid/inference/utils/benchmark.h | 56 + .../fluid/inference/utils/benchmark_tester.cc | 40 + .../inference/utils/table_printer_tester.cc | 82 + paddle/fluid/memory/CMakeLists.txt | 11 + paddle/fluid/memory/allocation/CMakeLists.txt | 14 +- paddle/fluid/memory/allocation/allocator.h | 14 +- .../memory/allocation/allocator_facade.cc | 32 +- .../memory/allocation/allocator_facade.h | 2 +- .../memory/allocation/buddy_allocator.cc | 6 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 + .../cuda_device_context_allocator.h | 9 +- .../allocation/cuda_managed_allocator.cc | 5 + .../allocation/naive_best_fit_allocator.cc | 28 +- .../memory/allocation/pinned_allocator.cc | 4 + .../allocation/stream_safe_cuda_allocator.cc | 16 + .../allocation/stream_safe_cuda_allocator.h | 3 + .../memory/allocation/system_allocator.cc | 22 +- .../memory/allocation/system_allocator.h | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/memcpy.cc | 83 +- paddle/fluid/operators/CMakeLists.txt | 12 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/batch_norm_op.cu | 2 +- .../fluid/operators/class_center_sample_op.cu | 23 +- .../fluid/operators/collective/CMakeLists.txt | 2 +- .../operators/collective/alltoall_op.cu.cc | 10 +- .../operators/collective/barrier_op.cu.cc | 12 +- .../operators/collective/c_allgather_op.cu.cc | 10 +- .../collective/c_allreduce_max_op.cu.cc | 4 +- 
.../operators/collective/c_allreduce_op.h | 20 +- .../collective/c_allreduce_sum_op.cu.cc | 4 +- .../operators/collective/c_broadcast_op.cu.cc | 14 +- .../collective/c_comm_init_all_op.cc | 4 +- .../collective/c_comm_init_multitrainer_op.cc | 10 +- .../operators/collective/c_comm_init_op.cc | 17 +- .../operators/collective/c_concat_op.cu.cc | 14 +- .../operators/collective/c_gen_nccl_id_op.cc | 14 +- .../fluid/operators/collective/c_reduce_op.h | 20 +- .../collective/c_reducescatter_op.cu.cc | 16 +- .../operators/collective/c_scatter_op.cu.cc | 10 +- .../c_softmax_with_cross_entropy_op.cu | 20 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../collective/c_sync_comm_stream_op.h | 6 +- .../operators/collective/c_wait_comm_op.cc | 7 +- .../operators/collective/c_wait_compute_op.cc | 7 +- .../operators/collective/gen_nccl_id_op.cc | 14 +- .../collective/global_gather_op.cu.cc | 38 +- .../collective/global_scatter_op.cu.cc | 38 +- .../collective/mp_allreduce_sum_op.cu.cc | 4 +- .../collective/partial_allgather_op.cu.cc | 14 +- .../collective/partial_recv_op.cu.cc | 14 +- .../collective/partial_send_op.cu.cc | 14 +- .../operators/collective/recv_v2_op.cu.cc | 24 +- .../operators/collective/send_v2_op.cu.cc | 26 +- .../controlflow/conditional_block_op.h | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../operators/controlflow/get_places_op.cc | 4 +- .../operators/controlflow/while_op_helper.cc | 2 +- paddle/fluid/operators/data_norm_op.cu | 28 +- .../fluid/operators/detection/CMakeLists.txt | 4 +- .../fluid/operators/detection/bbox_util.cu.h | 2 +- .../detection/collect_fpn_proposals_op.cu | 2 +- paddle/fluid/operators/dgc_clip_by_norm_op.h | 76 +- .../elementwise/elementwise_op_function.h | 19 +- paddle/fluid/operators/expand_op.cc | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 + paddle/fluid/operators/fused/CMakeLists.txt | 12 +- .../fluid/operators/fused/attn_bias_add.cu.h | 2 +- .../operators/fused/fused_attention_utils.h | 10 +- .../operators/fused/fused_dropout_common.h | 4 +- .../fused/fused_multi_transformer_op.cu.h | 12 +- .../operators/fused/fused_seqpool_cvm_op.cu | 59 + .../fluid/operators/fused/yolo_box_post_op.cu | 39 + .../get_tensor_from_selected_rows_op.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cu | 11 + .../operators/grid_sampler_cudnn_op.cu.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 3 + .../operators/margin_cross_entropy_op.cu | 38 +- .../operators/math/bert_encoder_functor.h | 8 +- paddle/fluid/operators/math/gru_compute.cc | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/math/prelu.h | 2 +- paddle/fluid/operators/math/sample_prob.cu | 5 + paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/matmul_op.cc | 12 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/nccl/CMakeLists.txt | 11 +- .../fluid/operators/nccl/nccl_gpu_common.cc | 10 +- paddle/fluid/operators/nccl/nccl_gpu_common.h | 4 +- paddle/fluid/operators/nccl/nccl_op.cc | 24 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 34 +- .../optimizers/distributed_fused_lamb_op.cu | 188 +- .../operators/optimizers/sparse_momentum_op.h | 5 +- 
.../operators/pscore/send_and_recv_op.cc | 2 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 9 +- .../fluid/operators/reader/buffered_reader.h | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 4 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/select_op_helper.h | 2 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_softmax_cudnn_op.cu.cc | 69 +- .../sequence_ops/sequence_softmax_op.cc | 2 +- .../sequence_ops/sequence_softmax_op.cu | 4 + paddle/fluid/operators/set_value_op.cc | 44 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 2 +- paddle/fluid/operators/sync_batch_norm_op.cu | 94 +- .../fluid/operators/sync_batch_norm_utils.h | 15 +- paddle/fluid/operators/top_k_op.cu | 3 +- paddle/fluid/operators/uniform_random_op.h | 4 +- paddle/fluid/platform/CMakeLists.txt | 66 +- paddle/fluid/platform/collective_helper.cc | 32 +- paddle/fluid/platform/collective_helper.h | 14 +- paddle/fluid/platform/device/CMakeLists.txt | 2 +- paddle/fluid/platform/device/device_wrapper.h | 2 +- .../fluid/platform/device/gpu/CMakeLists.txt | 12 + paddle/fluid/platform/device/gpu/gpu_helper.h | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 13 + paddle/fluid/platform/device/gpu/gpu_info.h | 2 +- .../platform/device/gpu/gpu_launch_config.h | 4 +- .../platform/device/gpu/gpu_resource_pool.cc | 12 +- .../platform/device/gpu/gpu_resource_pool.h | 7 +- paddle/fluid/platform/device/gpu/gpu_types.h | 124 +- .../platform/device/gpu/musa/musa_helper.h | 104 ++ .../fluid/platform/device/gpu/nccl_helper.h | 86 +- paddle/fluid/platform/device_context.cc | 10 +- paddle/fluid/platform/device_context.h | 16 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_base.cc | 8 + paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/dynload/CMakeLists.txt | 22 + .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/mccl.cc | 43 + paddle/fluid/platform/dynload/mccl.h | 51 + paddle/fluid/platform/dynload/mublas.cc | 38 + paddle/fluid/platform/dynload/mublas.h | 55 + paddle/fluid/platform/dynload/mudnn.cc | 30 + paddle/fluid/platform/dynload/mudnn.h | 39 + paddle/fluid/platform/dynload/mufft.cc | 30 + paddle/fluid/platform/dynload/mufft.h | 93 + paddle/fluid/platform/dynload/murand.cc | 27 + paddle/fluid/platform/dynload/murand.h | 43 + paddle/fluid/platform/dynload/musa_driver.cc | 31 + paddle/fluid/platform/dynload/musa_driver.h | 58 + paddle/fluid/platform/dynload/musartc.cc | 31 + paddle/fluid/platform/dynload/musartc.h | 51 + paddle/fluid/platform/dynload/musparse.cc | 30 + paddle/fluid/platform/dynload/musparse.h | 41 + paddle/fluid/platform/dynload/nccl.cc | 16 +- paddle/fluid/platform/dynload/nccl.h | 30 +- paddle/fluid/platform/dynload/rccl.cc | 16 +- paddle/fluid/platform/dynload/rccl.h | 14 +- paddle/fluid/platform/enforce.h | 26 +- paddle/fluid/platform/enforce_test.cc | 4 +- paddle/fluid/platform/event.h | 5 + paddle/fluid/platform/gen_comm_id_helper.cc | 6 +- paddle/fluid/platform/gen_comm_id_helper.h | 2 +- paddle/fluid/platform/init.cc | 20 +- paddle/fluid/platform/place.h | 4 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.cu | 19 + paddle/fluid/platform/profiler.h | 4 +- .../platform/profiler/chrometracing_logger.cc | 40 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 4 +- 
.../profiler/dump/deserialization_reader.h | 2 +- .../profiler/dump/serialization_logger.cc | 2 +- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_python.cc | 6 +- paddle/fluid/platform/profiler/event_python.h | 6 +- paddle/fluid/platform/profiler/profiler.cc | 13 +- .../fluid/platform/profiler/profiler_test.cc | 8 + paddle/fluid/platform/profiler_helper.h | 19 +- .../fluid/platform/stream_callback_manager.cc | 15 +- .../fluid/platform/stream_callback_manager.h | 5 + paddle/fluid/primitive/composite/composite.h | 45 +- paddle/fluid/pybind/CMakeLists.txt | 30 +- paddle/fluid/pybind/communication.cc | 2 +- paddle/fluid/pybind/cuda_streams_py.cc | 22 +- paddle/fluid/pybind/cuda_streams_py.h | 4 +- paddle/fluid/pybind/distributed_py.cc | 4 +- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 285 ++- paddle/fluid/pybind/generator_py.cc | 2 +- paddle/fluid/pybind/imperative.cc | 6 +- paddle/fluid/pybind/inference_api.cc | 14 +- paddle/fluid/pybind/parallel_executor.cc | 8 +- paddle/fluid/pybind/place.cc | 20 +- paddle/fluid/pybind/process_group_utils.h | 4 +- paddle/fluid/pybind/pybind.cc | 59 +- paddle/fluid/pybind/slice_utils.h | 151 +- paddle/fluid/pybind/tensor.cc | 10 +- paddle/fluid/pybind/tensor_py.h | 17 +- paddle/phi/CMakeLists.txt | 17 +- paddle/phi/api/include/context_pool.h | 2 +- paddle/phi/api/include/tensor.h | 7 +- paddle/phi/api/lib/api_gen_utils.cc | 6 +- paddle/phi/api/lib/context_pool.cc | 4 +- paddle/phi/api/lib/data_transform.cc | 8 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_utils.cc | 40 +- paddle/phi/api/profiler/event.h | 32 +- paddle/phi/api/yaml/backward.yaml | 6 +- .../phi/api/yaml/generator/dist_bw_api_gen.py | 1 - paddle/phi/api/yaml/legacy_backward.yaml | 6 +- paddle/phi/api/yaml/op_compat.yaml | 2 +- paddle/phi/api/yaml/ops.yaml | 2 +- paddle/phi/backends/CMakeLists.txt | 6 +- paddle/phi/backends/context_pool.cc | 2 +- paddle/phi/backends/context_pool.h | 4 +- paddle/phi/backends/custom/custom_device.cc | 2 +- paddle/phi/backends/device_code.cc | 144 +- paddle/phi/backends/device_code.h | 16 +- paddle/phi/backends/device_memory_aligment.h | 2 +- paddle/phi/backends/dynload/CMakeLists.txt | 22 + paddle/phi/backends/dynload/dynamic_loader.cc | 48 + paddle/phi/backends/dynload/dynamic_loader.h | 1 + paddle/phi/backends/dynload/mccl.cc | 36 + paddle/phi/backends/dynload/mccl.h | 80 + paddle/phi/backends/dynload/mublas.cc | 38 + paddle/phi/backends/dynload/mublas.h | 128 ++ paddle/phi/backends/dynload/mudnn.cc | 41 + paddle/phi/backends/dynload/mudnn.h | 41 + paddle/phi/backends/dynload/mufft.cc | 43 + paddle/phi/backends/dynload/mufft.h | 155 ++ paddle/phi/backends/dynload/murand.cc | 28 + paddle/phi/backends/dynload/murand.h | 54 + paddle/phi/backends/dynload/musa_driver.cc | 33 + paddle/phi/backends/dynload/musa_driver.h | 69 + paddle/phi/backends/dynload/musartc.cc | 34 + paddle/phi/backends/dynload/musartc.h | 147 ++ paddle/phi/backends/dynload/musparse.cc | 29 + paddle/phi/backends/dynload/musparse.h | 76 + paddle/phi/backends/dynload/nccl.h | 14 +- paddle/phi/backends/dynload/rccl.h | 14 +- paddle/phi/backends/gpu/forwards.h | 19 + paddle/phi/backends/gpu/gpu_context.cc | 176 +- paddle/phi/backends/gpu/gpu_context.h | 22 +- paddle/phi/backends/gpu/gpu_decls.h | 81 +- paddle/phi/backends/gpu/gpu_device_function.h | 4 +- paddle/phi/backends/gpu/gpu_dnn.h | 5 +- 
paddle/phi/backends/gpu/gpu_helper.h | 4 +- paddle/phi/backends/gpu/gpu_info.h | 2 +- paddle/phi/backends/gpu/gpu_launch_config.h | 4 +- paddle/phi/backends/gpu/gpu_primitives.h | 186 +- paddle/phi/backends/gpu/gpu_resources.cc | 175 +- paddle/phi/backends/gpu/gpu_resources.h | 8 +- paddle/phi/backends/gpu/gpu_types.h | 70 +- paddle/phi/backends/gpu/musa/mudnn_desc.h | 202 +++ paddle/phi/backends/gpu/musa/mudnn_helper.h | 323 ++++ .../backends/gpu/musa/musa_device_function.h | 193 ++ paddle/phi/backends/gpu/musa/musa_helper.h | 74 + paddle/phi/backends/gpu/musa/musa_info.cc | 334 ++++ paddle/phi/capi/include/c_meta_tensor.h | 12 - paddle/phi/capi/include/c_tensor.h | 17 - paddle/phi/capi/include/wrapper_base.h | 66 - paddle/phi/capi/lib/c_device_context.cc | 2 +- paddle/phi/capi/lib/c_kernel_context.cc | 2 +- paddle/phi/capi/lib/c_meta_tensor.cc | 46 - paddle/phi/capi/lib/c_tensor.cc | 72 - paddle/phi/common/backend.h | 2 +- paddle/phi/common/bfloat16.h | 40 +- paddle/phi/common/complex.h | 19 +- paddle/phi/common/cpstring_impl.h | 6 +- paddle/phi/common/float16.h | 53 +- paddle/phi/common/memory_utils.cc | 6 +- paddle/phi/common/memory_utils.h | 23 +- paddle/phi/common/place.cc | 4 +- paddle/phi/common/transform.h | 17 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/cuda_stream.h | 22 + paddle/phi/core/distributed/CMakeLists.txt | 2 +- .../auto_parallel/reshard/reshard_utils.cc | 4 +- .../auto_parallel/reshard/reshard_utils.h | 4 +- .../phi/core/distributed/check/CMakeLists.txt | 2 +- .../distributed/check/nccl_dynamic_check.cc | 38 +- .../distributed/check/nccl_dynamic_check.h | 10 +- .../core/distributed/comm_context_manager.cc | 16 +- .../core/distributed/comm_context_manager.h | 8 +- paddle/phi/core/distributed/comm_task.h | 9 +- .../phi/core/distributed/comm_task_manager.cc | 2 +- .../phi/core/distributed/nccl_comm_context.cc | 50 +- .../phi/core/distributed/nccl_comm_context.h | 31 +- paddle/phi/core/distributed/nccl_comm_task.cc | 55 +- paddle/phi/core/distributed/nccl_comm_task.h | 6 +- paddle/phi/core/distributed/nccl_tools.cc | 76 +- paddle/phi/core/distributed/nccl_tools.h | 36 +- paddle/phi/core/enforce.h | 272 ++- paddle/phi/core/flags.cc | 22 +- paddle/phi/core/generator.cc | 5 +- paddle/phi/core/hostdevice.h | 6 +- paddle/phi/core/kernel_factory.cc | 4 +- paddle/phi/core/kernel_registry.cc | 2 +- paddle/phi/core/kernel_registry.h | 2 +- paddle/phi/core/kernel_utils.h | 2 +- paddle/phi/core/mixed_vector.cc | 4 +- paddle/phi/core/string_tensor.cc | 4 +- paddle/phi/core/tensor_utils.cc | 16 +- paddle/phi/core/utils/data_type.h | 29 +- paddle/phi/core/utils/type_info.cc | 4 +- paddle/phi/core/utils/visit_place.h | 4 +- paddle/phi/core/visit_type.h | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 76 +- paddle/phi/kernels/array_kernel.cc | 8 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/autotune/gpu_timer.h | 39 +- paddle/phi/kernels/batch_norm_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 14 + .../phi/kernels/cpu/cum_maxmin_grad_kernel.cc | 8 +- paddle/phi/kernels/cpu/decode_jpeg_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 +- .../kernels/cpu/put_along_axis_grad_kernel.cc | 149 +- .../phi/kernels/cpu/put_along_axis_kernel.cc | 40 +- .../cpu/repeat_interleave_grad_kernel.cc | 6 +- .../kernels/cpu/repeat_interleave_kernel.cc | 6 +- .../phi/kernels/cpu/set_value_grad_kernel.cc | 17 
- .../cpu/take_along_axis_grad_kernel.cc | 3 +- .../phi/kernels/cpu/take_along_axis_kernel.cc | 6 +- .../kernels/custom/c_embedding_grad_kernel.cc | 93 - .../phi/kernels/custom/c_embedding_kernel.cc | 84 - paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 8 +- paddle/phi/kernels/funcs/activation_functor.h | 2 +- paddle/phi/kernels/funcs/algorithm.h | 4 +- paddle/phi/kernels/funcs/blas/blas.h | 14 +- paddle/phi/kernels/funcs/blas/blas_impl.h | 4 +- paddle/phi/kernels/funcs/blas/blas_impl.mu.h | 1602 +++++++++++++++++ paddle/phi/kernels/funcs/broadcast_function.h | 4 +- .../phi/kernels/funcs/check_numerics_utils.h | 2 +- .../kernels/funcs/concat_and_split_functor.cu | 2 +- .../phi/kernels/funcs/detail/gru_cpu_kernel.h | 2 +- .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 4 +- paddle/phi/kernels/funcs/detail/gru_kernel.h | 10 +- .../kernels/funcs/detail/lstm_cpu_kernel.h | 2 +- paddle/phi/kernels/funcs/detail/lstm_kernel.h | 4 +- .../phi/kernels/funcs/detail/strided_memcpy.h | 6 +- paddle/phi/kernels/funcs/diagonal.h | 6 +- .../phi/kernels/funcs/distribution_helper.h | 48 +- paddle/phi/kernels/funcs/dropout_impl.cu.h | 23 +- paddle/phi/kernels/funcs/elementwise_base.h | 6 +- .../phi/kernels/funcs/elementwise_functor.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- .../funcs/emb_eltwise_layer_norm_functor.cu | 7 +- paddle/phi/kernels/funcs/fc_functor.cu | 6 +- paddle/phi/kernels/funcs/fft.cu | 7 +- paddle/phi/kernels/funcs/fft_cache.h | 2 + paddle/phi/kernels/funcs/fft_fill_conj.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 2 +- .../kernels/funcs/gather_scatter_functor.cc | 456 +---- .../kernels/funcs/gather_scatter_functor.cu | 951 +--------- .../kernels/funcs/gather_scatter_functor.h | 183 -- paddle/phi/kernels/funcs/gru_compute.cc | 8 +- paddle/phi/kernels/funcs/inclusive_scan.h | 2 +- paddle/phi/kernels/funcs/index_calculator.h | 2 +- paddle/phi/kernels/funcs/index_put_utils.h | 163 +- .../phi/kernels/funcs/interpolate_function.h | 4 +- paddle/phi/kernels/funcs/isfinite_functor.h | 6 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 2 +- paddle/phi/kernels/funcs/layer_norm_util.h | 4 +- paddle/phi/kernels/funcs/load_store_util.h | 2 +- paddle/phi/kernels/funcs/math_cuda_utils.h | 17 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/matrix_inverse.cu | 2 +- paddle/phi/kernels/funcs/matrix_solve.cu | 2 +- paddle/phi/kernels/funcs/mode.h | 4 +- paddle/phi/kernels/funcs/mufft_util.h | 130 ++ .../kernels/funcs/multihead_matmul_functor.cu | 10 +- paddle/phi/kernels/funcs/norm_utils.cu.h | 2 +- paddle/phi/kernels/funcs/pooling.h | 6 +- paddle/phi/kernels/funcs/reduce_function.h | 6 +- paddle/phi/kernels/funcs/segmented_array.h | 2 +- paddle/phi/kernels/funcs/select_impl.cu.h | 4 +- .../kernels/funcs/skip_layernorm_functor.cu | 8 +- .../kernels/funcs/skip_layernorm_functor.h | 6 + paddle/phi/kernels/funcs/softmax.cu | 36 +- paddle/phi/kernels/funcs/softmax.h | 2 +- paddle/phi/kernels/funcs/sparse/softmax.cu.h | 4 + paddle/phi/kernels/funcs/sparse/sparse_blas.h | 4 + paddle/phi/kernels/funcs/squared_l2_norm.h | 6 +- paddle/phi/kernels/funcs/strided_memcpy.h | 2 +- .../phi/kernels/funcs/top_k_function_cuda.h | 29 +- .../cutlass/fused_conv2d_add_act_kernel.cu | 1 + 
paddle/phi/kernels/fusion/gpu/block_attn.h | 1 - .../fusion/gpu/fused_bias_act_kernel.cu | 4 +- .../kernels/fusion/gpu/fused_bias_act_utils.h | 4 +- ...dropout_residual_layer_norm_grad_kernel.cu | 6 +- ...bias_dropout_residual_layer_norm_kernel.cu | 4 +- .../gpu/fused_bn_activation_grad_kernel.cu | 2 +- .../fusion/gpu/fused_bn_activation_kernel.cu | 2 +- .../fused_bn_add_activation_grad_kernel.cu | 2 +- .../gpu/fused_bn_add_activation_kernel.cu | 2 +- .../gpu/fused_dropout_add_grad_kernel.cu | 6 +- .../fusion/gpu/fused_dropout_add_kernel.cu | 6 +- .../fused_fc_elementwise_layernorm_kernel.cu | 4 +- .../fusion/gpu/fused_layernorm_kernel.cu | 11 +- .../phi/kernels/fusion/gpu/fused_rope_utils.h | 16 +- .../fused_softmax_mask_upper_triangle_utils.h | 8 +- .../fusion/gpu/fused_softmax_mask_utils.h | 10 +- .../gpu/masked_multihead_attention_kernel.cu | 4 +- paddle/phi/kernels/fusion/gpu/mmha_util.cu.h | 2 +- .../fusion/gpu/multihead_matmul_kernel.cu | 2 + .../phi/kernels/gpu/activation_grad_kernel.cu | 12 +- paddle/phi/kernels/gpu/activation_kernel.cu | 12 +- paddle/phi/kernels/gpu/all_gather_kernel.cu | 4 +- paddle/phi/kernels/gpu/all_reduce_kernel.cu | 21 +- paddle/phi/kernels/gpu/all_to_all_kernel.cu | 41 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 2 + paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 4 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/argsort_kernel.cu | 2 +- paddle/phi/kernels/gpu/auc_kernel.cu | 10 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 18 +- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 10 +- paddle/phi/kernels/gpu/broadcast_kernel.cu | 4 +- .../phi/kernels/gpu/check_numerics_kernel.cu | 6 + paddle/phi/kernels/gpu/cholesky_kernel.cu | 2 +- .../kernels/gpu/cholesky_solve_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 2 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 4 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 34 +- paddle/phi/kernels/gpu/cum_kernel.cu | 2 + .../phi/kernels/gpu/cum_maxmin_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/dgc_kernel.cu | 2 +- paddle/phi/kernels/gpu/dirichlet_kernel.cu | 12 + paddle/phi/kernels/gpu/dist_concat_kernel.cu | 4 +- paddle/phi/kernels/gpu/dist_kernel.cu | 2 +- .../gpu/distribute_fpn_proposals_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigh_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigvalsh_kernel.cu | 2 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 3 + .../kernels/gpu/generate_proposals_kernel.cu | 2 +- .../phi/kernels/gpu/graph_reindex_kernel.cu | 12 + .../gpu/graph_sample_neighbors_kernel.cu | 17 + .../kernels/gpu/graph_send_ue_recv_funcs.h | 9 + paddle/phi/kernels/gpu/group_norm_kernel.cu | 13 +- paddle/phi/kernels/gpu/group_norm_utils.h | 2 +- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 5 +- paddle/phi/kernels/gpu/instance_norm_utils.h | 2 +- .../kernels/gpu/interpolate_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 12 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +- .../phi/kernels/gpu/logsumexp_function.cu.h | 58 + paddle/phi/kernels/gpu/lstsq_kernel.cu | 2 +- paddle/phi/kernels/gpu/lu_kernel.cu | 2 +- paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 2 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 2 +- .../phi/kernels/gpu/multiclass_nms3_kernel.cu | 2 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 12 +- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 + 
paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 + paddle/phi/kernels/gpu/nonzero_kernel.cu | 2 +- paddle/phi/kernels/gpu/nop_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/p_recv_kernel.cu | 17 +- paddle/phi/kernels/gpu/p_send_kernel.cu | 19 +- paddle/phi/kernels/gpu/poisson_kernel.cu | 20 +- .../kernels/gpu/put_along_axis_grad_kernel.cu | 122 +- .../phi/kernels/gpu/put_along_axis_kernel.cu | 40 +- paddle/phi/kernels/gpu/qr_kernel.cu | 2 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 15 +- paddle/phi/kernels/gpu/reduce.h | 2 +- paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/gpu/reduce_kernel.cu | 17 +- .../phi/kernels/gpu/reduce_scatter_kernel.cu | 6 +- .../gpu/repeat_interleave_grad_kernel.cu | 6 +- .../kernels/gpu/repeat_interleave_kernel.cu | 6 +- paddle/phi/kernels/gpu/rms_norm_kernel.cu | 9 +- paddle/phi/kernels/gpu/rnn_functor.h | 55 + paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 +- .../kernels/gpu/send_u_recv_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 2 + .../kernels/gpu/send_ue_recv_grad_kernel.cu | 27 + paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 3 + paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 15 + .../phi/kernels/gpu/set_value_grad_kernel.cu | 17 - paddle/phi/kernels/gpu/sgd_kernel.cu | 16 + .../kernels/gpu/shuffle_batch_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_batch_kernel.cu | 4 +- paddle/phi/kernels/gpu/shuffle_batch_utils.h | 2 +- .../gpu/sigmoid_cross_entropy_with_logits.h | 3 +- paddle/phi/kernels/gpu/strided_copy_kernel.cu | 11 +- paddle/phi/kernels/gpu/svd_kernel.cu | 2 +- .../gpu/take_along_axis_grad_kernel.cu | 3 +- .../phi/kernels/gpu/take_along_axis_kernel.cu | 6 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 34 +- paddle/phi/kernels/gpu/unique_kernel.cu | 34 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- .../gpu/weighted_sample_neighbors_kernel.cu | 6 + .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 + .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_kernel.cu | 2 +- paddle/phi/kernels/group_norm_kernel.h | 2 +- .../phi/kernels/impl/clip_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 2 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 12 +- .../impl/elementwise_grad_kernel_impl.h | 2 +- .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/isclose_kernel_impl.h | 4 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/polygamma_kernel_impl.h | 4 +- paddle/phi/kernels/impl/pool_kernel_impl.h | 4 +- .../kernels/impl/quant_linear_kernel_impl.h | 2 +- paddle/phi/kernels/impl/renorm_impl.h | 6 +- .../impl/repeat_interleave_grad_kernel_impl.h | 10 +- .../impl/repeat_interleave_kernel_impl.h | 8 +- .../kernels/impl/segment_pool_kernel_impl.h | 7 +- .../kernels/impl/sequence_mask_kernel_impl.h | 4 +- .../kernels/impl/set_value_grad_kernel_impl.h | 22 - .../phi/kernels/impl/solve_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- .../kernels/impl/unstack_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unstack_kernel_impl.h | 6 +- 
.../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- .../impl/weight_quantize_kernel_gpu_impl.h | 11 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/elementwise_kernel.cu | 2 +- paddle/phi/kernels/layer_norm_kernel.h | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 4 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- .../kernels/primitive/compute_primitives.h | 6 + .../kernels/primitive/datamover_primitives.h | 5 + paddle/phi/kernels/prod_kernel.cc | 2 +- .../phi/kernels/put_along_axis_grad_kernel.h | 3 - paddle/phi/kernels/put_along_axis_kernel.h | 1 - paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_min_kernel.cc | 2 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- .../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 6 +- .../kernels/selected_rows/isfinite_kernel.cc | 4 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- .../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/set_value_grad_kernel.h | 10 - paddle/phi/kernels/shape_kernel.cc | 2 +- .../kernels/sparse/gpu/softmax_grad_kernel.cu | 3 + .../kernels/sparse/gpu/sparse_utils_kernel.cu | 7 + paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/stride/as_complex_kernel.cc | 2 +- paddle/phi/kernels/stride/as_real_kernel.cc | 2 +- .../phi/kernels/stride/complex_grad_kernel.cc | 2 +- paddle/phi/kernels/stride/complex_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/case_utils.h | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 10 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.cc | 10 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 4 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- .../phi/kernels/xpu/set_value_grad_kernel.cc | 31 - paddle/phi/tools/CMakeLists.txt | 4 + patches/eigen/Complex.h.patch | 33 +- patches/eigen/Eigen_CORE.patch | 13 + ...c_Core_util_ConfigureVectorization.h.patch | 21 + .../eigen/Eigen_src_Core_util_Macros.h.patch | 51 + .../eigen/Eigen_src_Core_util_Meta.h.patch | 58 + patches/eigen/TensorReductionGpu.h | 2 +- .../unsupported_Eigen_CXX11_Tensor.patch | 13 + ...11_src_Tensor_TensorContractionGpu.h.patch | 22 + ...X11_src_Tensor_TensorDeviceDefault.h.patch | 15 + ...n_CXX11_src_Tensor_TensorDeviceGpu.h.patch | 15 + ...src_Tensor_TensorGpuHipCudaDefines.h.patch | 40 + ...n_CXX11_src_Tensor_TensorReduction.h.patch | 13 + python/CMakeLists.txt | 2 + python/cinn/compiler/expr_executor.py | 9 +- python/env_dict.py.in | 1 + python/paddle/__init__.py | 1 + python/paddle/base/__init__.py | 1 + .../base/dygraph/tensor_patch_methods.py | 11 +- python/paddle/base/executor.py | 12 +- python/paddle/base/framework.py | 17 +- python/paddle/base/layers/math_op_patch.py | 6 +- python/paddle/base/variable_index.py | 143 +- python/paddle/dataset/common.py | 6 - python/paddle/device/__init__.py | 2 + 
python/paddle/device/cuda/graphs.py | 3 +- python/paddle/distributed/auto_tuner/prune.py | 43 +- .../distributed/fleet/base/role_maker.py | 9 +- .../paddle/distributed/fleet/launch_utils.py | 2 +- .../distributed/fleet/layers/mpu/mp_layers.py | 2 +- python/paddle/distributed/fleet/utils/fs.py | 33 +- .../fleet/utils/sequence_parallel_utils.py | 1 + .../launch/controllers/collective.py | 10 +- .../paddle/distributed/launch/utils/nvsmi.py | 2 + python/paddle/distributed/rpc/rpc.py | 4 +- .../paddle/distributed/utils/launch_utils.py | 2 +- python/paddle/hapi/hub.py | 1 + .../incubate/distributed/fleet/fleet_util.py | 22 +- .../paddle/io/dataloader/dataloader_iter.py | 7 +- .../paddle/jit/dy2static/convert_operators.py | 7 +- python/paddle/nn/functional/conv.py | 1 + python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/quant/format.py | 39 +- .../paddle/quantization/observers/__init__.py | 3 +- .../quantization/observers/groupwise.py | 113 -- python/paddle/quantization/quantize.py | 17 +- python/paddle/tensor/manipulation.py | 74 +- .../utils/cpp_extension/extension_utils.py | 1 - python/paddle/utils/download.py | 49 +- python/setup.py.in | 2 +- security/README.md | 36 +- security/README_cn.md | 38 +- security/README_ja.md | 36 +- security/advisory/pdsa-2023-004_cn.md | 2 +- security/advisory/pdsa-2023-006.md | 31 - security/advisory/pdsa-2023-006_cn.md | 31 - security/advisory/pdsa-2023-007.md | 31 - security/advisory/pdsa-2023-007_cn.md | 31 - security/advisory/pdsa-2023-008.md | 31 - security/advisory/pdsa-2023-008_cn.md | 31 - security/advisory/pdsa-2023-009.md | 31 - security/advisory/pdsa-2023-009_cn.md | 31 - security/advisory/pdsa-2023-010.md | 33 - security/advisory/pdsa-2023-010_cn.md | 33 - security/advisory/pdsa-2023-011.md | 32 - security/advisory/pdsa-2023-011_cn.md | 32 - security/advisory/pdsa-2023-012.md | 35 - security/advisory/pdsa-2023-012_cn.md | 35 - security/advisory/pdsa-2023-013.md | 32 - security/advisory/pdsa-2023-013_cn.md | 32 - security/advisory/pdsa-2023-014.md | 32 - security/advisory/pdsa-2023-014_cn.md | 32 - security/advisory/pdsa-2023-015.md | 33 - security/advisory/pdsa-2023-015_cn.md | 33 - security/advisory/pdsa-2023-016.md | 32 - security/advisory/pdsa-2023-016_cn.md | 32 - security/advisory/pdsa-2023-017.md | 33 - security/advisory/pdsa-2023-017_cn.md | 33 - security/advisory/pdsa-2023-018.md | 32 - security/advisory/pdsa-2023-018_cn.md | 32 - security/advisory/pdsa-2023-019.md | 35 - security/advisory/pdsa-2023-019_cn.md | 35 - security/advisory/pdsa-2023-020.md | 28 - security/advisory/pdsa-2023-020_cn.md | 28 - security/advisory/pdsa-2023-021.md | 33 - security/advisory/pdsa-2023-021_cn.md | 33 - security/advisory/pdsa-2023-022.md | 30 - security/advisory/pdsa-2023-022_cn.md | 30 - security/advisory/pdsa-2023-023.md | 28 - security/advisory/pdsa-2023-023_cn.md | 28 - .../hybrid_strategy/CMakeLists.txt | 2 +- test/collective/fleet/CMakeLists.txt | 4 +- .../run_server_for_communicator_half_async.py | 38 - .../fleet/test_communicator_half_async.py | 118 +- .../fleet/test_dygraph_sharding_stage2.py | 9 +- .../fleet/test_parallel_dygraph_mp_layers.py | 5 +- .../fleet/test_parallel_dygraph_qat.py | 2 +- test/cpp/fluid/CMakeLists.txt | 2 + test/cpp/fluid/inference/CMakeLists.txt | 1 + test/cpp/fluid/inference/utils/CMakeLists.txt | 16 + .../fluid/inference/utils/io_utils_tester.cc | 154 ++ test/cpp/fluid/nccl/CMakeLists.txt | 2 +- test/cpp/fluid/nccl/nccl_op_test.cu.cc | 12 +- test/cpp/imperative/CMakeLists.txt | 3 +- 
test/cpp/imperative/nccl_context_test.cc | 10 +- test/cpp/inference/api/tester_helper.h | 12 + .../inference/api/trt_dynamic_shape_test.cc | 1 - test/cpp/inference/test.cmake | 7 +- test/custom_runtime/CMakeLists.txt | 2 +- .../test_collective_process_group_xccl.py | 5 +- test/custom_runtime/test_custom_cpu_plugin.py | 3 +- .../test_custom_cpu_profiler_plugin.py | 3 +- .../test_custom_cpu_to_static.py | 3 +- test/custom_runtime/test_custom_op_setup.py | 3 +- .../test_fleet_launch_custom_device.sh | 2 +- test/dygraph_to_static/CMakeLists.txt | 14 +- test/dygraph_to_static/test_list.py | 1 - test/dygraph_to_static/test_mobile_net.py | 11 +- test/indexing/test_getitem.py | 34 - test/indexing/test_setitem.py | 130 +- test/ir/inference/program_config.py | 28 +- test/ir/inference/test_trt_convert_assign.py | 5 +- test/ir/inference/test_trt_convert_cast.py | 1 - .../test_trt_convert_lookup_table.py | 1 - test/ir/inference/test_trt_convert_solve.py | 5 +- test/legacy_test/CMakeLists.txt | 4 +- test/legacy_test/c_embedding_op_base.py | 25 +- test/legacy_test/test_adaptive_avg_pool1d.py | 1 + test/legacy_test/test_dist_hapi_model.py | 2 +- test/legacy_test/test_download.py | 15 +- .../test_parallel_dygraph_dataparallel.py | 2 +- ...t_parallel_dygraph_dataparallel_cpuonly.py | 2 +- test/legacy_test/test_put_along_axis_op.py | 762 +------- test/legacy_test/test_repeat_interleave_op.py | 19 - test/legacy_test/test_set_value_op.py | 82 - .../test_sparse_fused_attention_op.py | 5 - test/legacy_test/test_yolov3_loss_op.py | 3 +- test/quantization/test_groupwise.py | 69 - test/quantization/test_llm_int8_linear.py | 90 +- ..._post_training_quantization_mobilenetv1.py | 70 +- ...est_post_training_quantization_resnet50.py | 2 +- test/quantization/test_ptq.py | 42 - test/quantization/test_weight_only_linear.py | 42 - .../xpu/test_parallel_dygraph_dataparallel.py | 2 +- third_party/cryptopp | 1 - third_party/cryptopp-cmake | 1 - tools/enforce/grep_invalid_enforce.sh | 2 +- tools/parallel_UT_rule.py | 6 + 915 files changed, 11920 insertions(+), 8842 deletions(-) create mode 100644 cmake/mccl.cmake create mode 100644 cmake/mudnn.cmake create mode 100644 cmake/musa.cmake mode change 100644 => 100755 paddle/fluid/inference/tensorrt/op_teller.cc create mode 100644 paddle/fluid/inference/utils/benchmark.cc create mode 100644 paddle/fluid/inference/utils/benchmark.h create mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc create mode 100644 paddle/fluid/inference/utils/table_printer_tester.cc create mode 100644 paddle/fluid/platform/device/gpu/musa/musa_helper.h create mode 100644 paddle/fluid/platform/dynload/mccl.cc create mode 100644 paddle/fluid/platform/dynload/mccl.h create mode 100644 paddle/fluid/platform/dynload/mublas.cc create mode 100644 paddle/fluid/platform/dynload/mublas.h create mode 100644 paddle/fluid/platform/dynload/mudnn.cc create mode 100644 paddle/fluid/platform/dynload/mudnn.h create mode 100644 paddle/fluid/platform/dynload/mufft.cc create mode 100644 paddle/fluid/platform/dynload/mufft.h create mode 100644 paddle/fluid/platform/dynload/murand.cc create mode 100644 paddle/fluid/platform/dynload/murand.h create mode 100644 paddle/fluid/platform/dynload/musa_driver.cc create mode 100644 paddle/fluid/platform/dynload/musa_driver.h create mode 100644 paddle/fluid/platform/dynload/musartc.cc create mode 100644 paddle/fluid/platform/dynload/musartc.h create mode 100644 paddle/fluid/platform/dynload/musparse.cc create mode 100644 paddle/fluid/platform/dynload/musparse.h create mode 100644 
paddle/phi/backends/dynload/mccl.cc create mode 100644 paddle/phi/backends/dynload/mccl.h create mode 100644 paddle/phi/backends/dynload/mublas.cc create mode 100644 paddle/phi/backends/dynload/mublas.h create mode 100644 paddle/phi/backends/dynload/mudnn.cc create mode 100644 paddle/phi/backends/dynload/mudnn.h create mode 100644 paddle/phi/backends/dynload/mufft.cc create mode 100644 paddle/phi/backends/dynload/mufft.h create mode 100644 paddle/phi/backends/dynload/murand.cc create mode 100644 paddle/phi/backends/dynload/murand.h create mode 100644 paddle/phi/backends/dynload/musa_driver.cc create mode 100644 paddle/phi/backends/dynload/musa_driver.h create mode 100644 paddle/phi/backends/dynload/musartc.cc create mode 100644 paddle/phi/backends/dynload/musartc.h create mode 100644 paddle/phi/backends/dynload/musparse.cc create mode 100644 paddle/phi/backends/dynload/musparse.h create mode 100644 paddle/phi/backends/gpu/musa/mudnn_desc.h create mode 100644 paddle/phi/backends/gpu/musa/mudnn_helper.h create mode 100644 paddle/phi/backends/gpu/musa/musa_device_function.h create mode 100644 paddle/phi/backends/gpu/musa/musa_helper.h create mode 100644 paddle/phi/backends/gpu/musa/musa_info.cc delete mode 100644 paddle/phi/kernels/custom/c_embedding_grad_kernel.cc delete mode 100644 paddle/phi/kernels/custom/c_embedding_kernel.cc create mode 100644 paddle/phi/kernels/funcs/blas/blas_impl.mu.h create mode 100644 paddle/phi/kernels/funcs/mufft_util.h create mode 100644 patches/eigen/Eigen_CORE.patch create mode 100644 patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch create mode 100644 patches/eigen/Eigen_src_Core_util_Macros.h.patch create mode 100644 patches/eigen/Eigen_src_Core_util_Meta.h.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_Tensor.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch create mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch delete mode 100644 python/paddle/quantization/observers/groupwise.py delete mode 100644 security/advisory/pdsa-2023-006.md delete mode 100644 security/advisory/pdsa-2023-006_cn.md delete mode 100644 security/advisory/pdsa-2023-007.md delete mode 100644 security/advisory/pdsa-2023-007_cn.md delete mode 100644 security/advisory/pdsa-2023-008.md delete mode 100644 security/advisory/pdsa-2023-008_cn.md delete mode 100644 security/advisory/pdsa-2023-009.md delete mode 100644 security/advisory/pdsa-2023-009_cn.md delete mode 100644 security/advisory/pdsa-2023-010.md delete mode 100644 security/advisory/pdsa-2023-010_cn.md delete mode 100644 security/advisory/pdsa-2023-011.md delete mode 100644 security/advisory/pdsa-2023-011_cn.md delete mode 100644 security/advisory/pdsa-2023-012.md delete mode 100644 security/advisory/pdsa-2023-012_cn.md delete mode 100644 security/advisory/pdsa-2023-013.md delete mode 100644 security/advisory/pdsa-2023-013_cn.md delete mode 100644 security/advisory/pdsa-2023-014.md delete mode 100644 security/advisory/pdsa-2023-014_cn.md delete mode 100644 security/advisory/pdsa-2023-015.md delete mode 100644 security/advisory/pdsa-2023-015_cn.md delete mode 100644 security/advisory/pdsa-2023-016.md delete mode 100644 
security/advisory/pdsa-2023-016_cn.md delete mode 100644 security/advisory/pdsa-2023-017.md delete mode 100644 security/advisory/pdsa-2023-017_cn.md delete mode 100644 security/advisory/pdsa-2023-018.md delete mode 100644 security/advisory/pdsa-2023-018_cn.md delete mode 100644 security/advisory/pdsa-2023-019.md delete mode 100644 security/advisory/pdsa-2023-019_cn.md delete mode 100644 security/advisory/pdsa-2023-020.md delete mode 100644 security/advisory/pdsa-2023-020_cn.md delete mode 100644 security/advisory/pdsa-2023-021.md delete mode 100644 security/advisory/pdsa-2023-021_cn.md delete mode 100644 security/advisory/pdsa-2023-022.md delete mode 100644 security/advisory/pdsa-2023-022_cn.md delete mode 100644 security/advisory/pdsa-2023-023.md delete mode 100644 security/advisory/pdsa-2023-023_cn.md delete mode 100644 test/collective/fleet/run_server_for_communicator_half_async.py create mode 100644 test/cpp/fluid/inference/CMakeLists.txt create mode 100644 test/cpp/fluid/inference/utils/CMakeLists.txt create mode 100644 test/cpp/fluid/inference/utils/io_utils_tester.cc delete mode 100644 test/quantization/test_groupwise.py delete mode 160000 third_party/cryptopp delete mode 160000 third_party/cryptopp-cmake diff --git a/.gitmodules b/.gitmodules index 0c41450793fc2..8b06f4fb771cb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -110,11 +110,3 @@ path = third_party/cccl url = https://github.com/NVIDIA/cccl.git ignore = dirty -[submodule "third_party/cryptopp"] - path = third_party/cryptopp - url = https://github.com/weidai11/cryptopp.git - ignore = dirty -[submodule "third_party/cryptopp-cmake"] - path = third_party/cryptopp-cmake - url = https://github.com/noloader/cryptopp-cmake.git - ignore = dirty diff --git a/CMakeLists.txt b/CMakeLists.txt index e9f3fafe8d22a..da58f0095ae09 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,13 +41,14 @@ if(NOT CMAKE_BUILD_TYPE) endif() project(paddle CXX C) - +# set(CMAKE_VERBOSE_MAKEFILE ON) # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) -option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" OFF) +option(WITH_MUSA "Compile PaddlePaddle with MUSA" ON) option(WITH_MPI "Compile PaddlePaddle with MPI" OFF) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) @@ -89,6 +90,9 @@ endif() if(WITH_GPU AND WITH_ROCM) message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() +if(WITH_GPU AND WITH_MUSA) + message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -252,7 +256,7 @@ option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_MULTINODE_TESTING "Test multinode apis and ops" OFF) option(WITH_MKL "Compile PaddlePaddle with MKL support." 
${AVX_FOUND}) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) -option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" ON) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" ON) @@ -285,6 +289,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) +option(WITH_MCCL "Compile PaddlePaddle with MCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -352,6 +357,7 @@ endif() if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT WITH_GPU + AND NOT WITH_MUSA AND NOT WITH_ROCM AND NOT WITH_XPU AND NOT WITH_XPU_KP @@ -404,6 +410,14 @@ if(NOT WITH_GPU AND WITH_NCCL) CACHE STRING "Disable NCCL when compiling without GPU" FORCE) endif() +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + if(NOT WITH_GPU AND WITH_CUDNN_DSO) message( WARNING @@ -461,6 +475,19 @@ else() endif() endif() +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() +endif() + + if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") if(NOT WITH_DISTRIBUTE) @@ -486,6 +513,11 @@ if(WITH_ROCM) include(cupti) endif() +if(WITH_MUSA) + include(musa) + include(mudnn) +endif() + if(WITH_XPU_KP) include(xpu_kp) endif() @@ -498,6 +530,14 @@ if(NOT WITH_ROCM AND WITH_RCCL) CACHE STRING "Disable RCCL when compiling without ROCM" FORCE) endif() +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + if(WITH_RCCL) add_definitions("-DPADDLE_WITH_RCCL") include(rccl) @@ -510,6 +550,18 @@ else() endif() endif() +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() +endif() + if(WITH_HETERPS AND WITH_PSLIB) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -560,6 +612,13 @@ if(WITH_RPC) OFF CACHE BOOL "Disable WITH_RPC when compiling with ROCM" FORCE) endif() + if(WITH_MUSA AND WITH_RPC) + message( + WARNING "Disable WITH_RPC when compiling with MUSA. Force WITH_RPC=OFF.") + set(WITH_RPC + OFF + CACHE BOOL "Disable WITH_RPC when compiling with MUSA" FORCE) + endif() if(WITH_XPU AND WITH_RPC) message( WARNING "Disable WITH_RPC when compiling with XPU. 
Force WITH_RPC=OFF.") @@ -631,6 +690,12 @@ include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") +# distribute need openssl +# openssl install tutorial: https://www.howtoforge.com/tutorial/how-to-install-openssl-from-source-on-linux/ +include_directories("/usr/local/ssl/include") +link_directories("/usr/local/ssl/lib64") + + if(WITH_NV_JETSON) set(WITH_ARM ON diff --git a/README.md b/README.md index 001352ea45fc4..8f708334ed28f 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) +### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. diff --git a/README_cn.md b/README_cn.md index cd45e4e3ecd2b..a13fa5ba21450 100644 --- a/README_cn.md +++ b/README_cn.md @@ -18,9 +18,9 @@ ## 安装 -### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) +### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) -跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) ### 安装最新稳定版本: ``` diff --git a/README_ja.md b/README_ja.md index dad60eb7ffcf8..22c78a1a79bbd 100644 --- a/README_ja.md +++ b/README_ja.md @@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的 ## インストール -### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6) +### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index dc661fce388fe..29cca57db6589 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -175,6 +175,19 @@ elseif(WITH_ROCM) if(${MIOPEN_VERSION} VERSION_LESS 2090) message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") endif() +elseif(WITH_MUSA) + add_definitions(-DPADDLE_WITH_MUSA) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_MUSA) + if(MUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_MUPTI) + else() + message(STATUS "Cannot find MUPTI, GPU Profiling is incorrect.") + endif() + if(NOT MUDNN_FOUND) + message(FATAL_ERROR "Paddle needs mudnn to compile") + endif() else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index eb7ad44af2313..5967b468d65ce 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -1,4 +1,4 @@ -if(NOT WITH_GPU AND NOT WITH_ROCM) +if(NOT WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA) return() endif() @@ -6,6 +6,10 @@ if(WITH_ROCM) set(CUPTI_ROOT "${ROCM_PATH}/cuda/extras/CUPTI" CACHE PATH "CUPTI ROOT") +elseif(WITH_MUSA) + set(CUPTI_ROOT + "/usr/local/musa" + CACHE PATH "CUPTI ROOT") else() set(CUPTI_ROOT "/usr" diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index b3ec8f622923f..9daa4be7468e4 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -14,13 +14,12 @@ 
include(ExternalProject) -set(CRYPTOPP_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp) -set(CRYPTOPP_CMAKE_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp-cmake) set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) set(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." FORCE) +set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) set(CRYPTOPP_TAG CRYPTOPP_8_2_0) if(WIN32) @@ -64,16 +63,17 @@ include_directories(${CRYPTOPP_INCLUDE_DIR}) ExternalProject_Add( extern_cryptopp ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CRYPTOPP_REPOSITORY} + GIT_TAG ${CRYPTOPP_TAG} PREFIX ${CRYPTOPP_PREFIX_DIR} - SOURCE_DIR ${CRYPTOPP_SOURCE_DIR} UPDATE_COMMAND "" PATCH_COMMAND - COMMAND ${CMAKE_COMMAND} -E copy "${CRYPTOPP_CMAKE_SOURCE_DIR}/CMakeLists.txt" - "/CMakeLists.txt" - COMMAND - ${CMAKE_COMMAND} -E copy - "${CRYPTOPP_CMAKE_SOURCE_DIR}/cryptopp-config.cmake" - "/cryptopp-config.cmake" + COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" + COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" + COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b + ${CRYPTOPP_TAG} + COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" + "/" COMMAND ${CRYPTOPP_PATCH_COMMAND} INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 06e37b3c8a602..4051a09d767f6 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -60,6 +60,76 @@ if(CMAKE_COMPILER_IS_GNUCC) ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header}) endif() + if(WITH_MUSA) + file( + TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch + configure_vectorization_header) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/ + < ${configure_vectorization_header}) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Macros.h.patch + util_macros_header) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/ + < ${util_macros_header}) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Meta.h.patch + meta_header) + set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd + ${SOURCE_DIR}/Eigen/src/Core/util/ < ${meta_header}) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_Tensor.patch + cxx11_tensor) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd + ${SOURCE_DIR}/unsupported/Eigen/CXX11/ < ${cxx11_tensor}) + file( + TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch + tensor_contraction_gpu_header) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd + ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ < + ${tensor_contraction_gpu_header}) + file( + TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch + tensor_device_default_header) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd + ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ < + ${tensor_device_default_header}) + file( + TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch + tensor_gpu_hip_cuda_defines_header) + set(EIGEN_PATCH_COMMAND + 
${EIGEN_PATCH_COMMAND} && patch -Nd
+ ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
+ ${tensor_gpu_hip_cuda_defines_header})
+ file(
+ TO_NATIVE_PATH
+ ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch
+ tensor_reduction_header)
+ set(EIGEN_PATCH_COMMAND
+ ${EIGEN_PATCH_COMMAND} && patch -Nd
+ ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
+ ${tensor_reduction_header})
+ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_CORE.patch
+ eigen_core)
+ set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd
+ ${SOURCE_DIR}/Eigen/ < ${eigen_core})
+ file(
+ TO_NATIVE_PATH
+ ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch
+ tensor_device_gpu_header)
+ set(EIGEN_PATCH_COMMAND
+ ${EIGEN_PATCH_COMMAND} && patch -Nd
+ ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
+ ${tensor_device_gpu_header})
+ endif()
endif()
set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7a4956e6e1556..8d6384d2f0a14 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -246,6 +246,11 @@ if(WITH_GPU)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
endif()
+if(WITH_MUSA)
+ set(CMAKE_MUSA_FLAGS "${CMAKE_MUSA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
+endif()
+
+
if(WITH_ROCM)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c463dbc6064e1..788237cc4699b 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -453,6 +453,9 @@ function(cc_binary TARGET_NAME)
if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
endif()
+ if(WITH_MUSA)
+ target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB})
+ endif()
check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
@@ -481,6 +484,12 @@ function(cc_test_build TARGET_NAME)
if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
endif()
+ if(WITH_MUSA)
+ target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB})
+ # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/'
+ target_link_options(${TARGET_NAME} PRIVATE
+ -Wl,-rpath,/usr/lib/x86_64-linux-gnu/)
+ endif()
check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
endif()
endfunction()
@@ -619,6 +628,12 @@ function(paddle_test_build TARGET_NAME)
if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
endif()
+ if(WITH_MUSA)
+ target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB})
+ # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/'
+ target_link_options(${TARGET_NAME} PRIVATE
+ -Wl,-rpath,/usr/lib/x86_64-linux-gnu/)
+ endif()
if(APPLE)
target_link_libraries(
${TARGET_NAME}
@@ -750,6 +765,115 @@ function(nv_test TARGET_NAME)
endif()
endfunction()
+
+
+function(musa_library TARGET_NAME)
+ if(WITH_MUSA)
+ set(options STATIC static SHARED shared)
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}"
+ "${multiValueArgs}" ${ARGN})
+ if(musa_library_SRCS)
+ if(musa_library_SHARED OR musa_library_shared) # build *.so
+ musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS})
+ else()
+ musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS})
+ find_fluid_modules(${TARGET_NAME})
+ find_phi_modules(${TARGET_NAME})
+ endif()
+ if(musa_library_DEPS)
+ add_dependencies(${TARGET_NAME} ${musa_library_DEPS})
+ target_link_libraries(${TARGET_NAME} ${musa_library_DEPS})
+ endif()
+ # cpplint code style
+ foreach(source_file
${musa_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND musa_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else() + if(musa_library_DEPS) + list(REMOVE_DUPLICATES musa_library_DEPS) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:musa_library") + + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + else() + message(FATAL "Please specify source file or library in musa_library.") + endif() + endif() + endif() +endfunction() + +function(musa_binary TARGET_NAME) + if(WITH_MUSA) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + if(musa_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction() + +function(musa_test TARGET_NAME) + if(WITH_MUSA AND WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + musa_add_executable(${TARGET_NAME} ${musa_test_SRCS}) + # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE + target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + glog + phi + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + phi + glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) + set_property( + TEST ${TARGET_NAME} + PROPERTY + ENVIRONMENT + "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" + ) + endif() +endfunction() + + + function(hip_library TARGET_NAME) if(WITH_ROCM) set(options STATIC static SHARED shared) @@ -758,6 +882,12 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) + # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" + OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + endif() if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() @@ -771,10 +901,6 @@ function(hip_library TARGET_NAME) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) - if(NOT ${source_file} MATCHES "\\.cu$") - set_source_files_properties(${source_file} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - endif() string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) list(APPEND hip_library_HEADERS @@ 
-1375,6 +1501,15 @@ function(math_library TARGET) ${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(WITH_MUSA) + musa_library( + ${TARGET} + SRCS + ${cc_srcs} + ${cu_srcs} + DEPS + ${math_library_DEPS} + ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library( ${TARGET} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 517ac24cccc72..06dc5d6173794 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -237,16 +237,6 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") -if(WIN32) - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/common.*) -else() - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) -endif() -copy( - inference_lib_dist - SRCS ${paddle_common_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) - if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib @@ -278,6 +268,11 @@ else() SRCS ${paddle_phi_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) + copy( + inference_lib_dist + SRCS ${paddle_common_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() copy( diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake new file mode 100644 index 0000000000000..5ce4ea9c25fec --- /dev/null +++ b/cmake/mccl.cmake @@ -0,0 +1,51 @@ +if(NOT WITH_MUSA) + return() +endif() + +# Now we don't support MCCL on windows +if(WIN32) + return() +endif() + +if(WITH_MCCL) + set(MCCL_ROOT + "/usr/local/musa/" + CACHE PATH "MCCL ROOT") + find_path( + MCCL_INCLUDE_DIR mccl.h + PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include + $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include + NO_DEFAULT_PATH) + + if(MCCL_INCLUDE_DIR) + file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define MCCL_MAJOR +([0-9]+)" MCCL_MAJOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MAJOR +([0-9]+)" "\\1" MCCL_MAJOR_VERSION + "${MCCL_MAJOR_VERSION}") + string(REGEX MATCH "define MCCL_MINOR +([0-9]+)" MCCL_MINOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MINOR +([0-9]+)" "\\1" MCCL_MINOR_VERSION + "${MCCL_MINOR_VERSION}") + string(REGEX MATCH "define MCCL_PATCH +([0-9]+)" MCCL_PATCH_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_PATCH +([0-9]+)" "\\1" MCCL_PATCH_VERSION + "${MCCL_PATCH_VERSION}") + if(NOT MCCL_MAJOR_VERSION) + set(MCCL_VERSION "???") + else() + math(EXPR MCCL_VERSION "${MCCL_MAJOR_VERSION} * 1000 + + ${MCCL_MINOR_VERSION} * 100 + ${MCCL_PATCH_VERSION}") + endif() + include_directories(${MCCL_INCLUDE_DIR}) + + message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. 
") + message( + STATUS + "Current MCCL version is " + "v${MCCL_MAJOR_VERSION}.${MCCL_MINOR_VERSION}.${MCCL_PATCH_VERSION} ") + else() + message(FATAL_ERROR "WITH_MCCL is enabled but mccl.h file is not found!") + endif() +endif() diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake new file mode 100644 index 0000000000000..81027890d144e --- /dev/null +++ b/cmake/mudnn.cmake @@ -0,0 +1,92 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(WIN32) + return() +else() + set(MUDNN_ROOT + "/usr/local/musa" + CACHE PATH "MUDNN ROOT") +endif() + +find_path( + MUDNN_INCLUDE_DIR mudnn.h + PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + MUDNN_CHECK_LIBRARY_DIRS + ${MUDNN_ROOT} + ${MUDNN_ROOT}/lib64 + ${MUDNN_ROOT}/lib + ${MUDNN_ROOT}/lib/x64 + ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/lib64 + $ENV{MUDNN_ROOT}/lib + $ENV{MUDNN_ROOT}/lib/x64 + /usr/lib + ${MUSA_TOOLKIT_ROOT_DIR} + ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) +set(MUDNN_LIB_NAME "") + +if(LINUX) + set(MUDNN_LIB_NAME "libmudnn.so") +endif() + +find_library( + MUDNN_LIBRARY + NAMES ${MUDNN_LIB_NAME} + PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} + NO_DEFAULT_PATH + DOC "Path to muDNN library.") + +if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) + set(MUDNN_FOUND ON) +else() + set(MUDNN_FOUND OFF) +endif() + +macro(find_mudnn_version mudnn_version_file) + file(READ ${mudnn_version_file} MUDNN_VERSION_FILE_CONTENTS) + get_filename_component(MUDNN_LIB_PATH ${MUDNN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MUDNN_VERSION_MAJOR +([0-9]+)" MUDNN_MAJOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MAJOR +([0-9]+)" "\\1" + MUDNN_MAJOR_VERSION "${MUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_MINOR +([0-9]+)" MUDNN_MINOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MINOR +([0-9]+)" "\\1" + MUDNN_MINOR_VERSION "${MUDNN_MINOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_PATCH +([0-9]+)" MUDNN_PATCH_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_PATCH +([0-9]+)" "\\1" + MUDNN_PATCH_VERSION "${MUDNN_PATCH_VERSION}") + + if(NOT MUDNN_MAJOR_VERSION) + set(MUDNN_VERSION "???") + else() + add_definitions("-DMUDNN_MAJOR_VERSION=\"${MUDNN_MAJOR_VERSION}\"") + math(EXPR MUDNN_VERSION "${MUDNN_MAJOR_VERSION} * 1000 + + ${MUDNN_MINOR_VERSION} * 100 + ${MUDNN_PATCH_VERSION}") + message(STATUS "Current muDNN version file is ${mudnn_version_file} ") + message( + STATUS + "Current muDNN version is v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}.${MUDNN_PATCH_VERSION}. 
" + ) + endif() +endmacro() + +if(MUDNN_FOUND) + find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) + include_directories(${MUDNN_INCLUDE_DIR}) +endif() diff --git a/cmake/musa.cmake b/cmake/musa.cmake new file mode 100644 index 0000000000000..63a85e827061c --- /dev/null +++ b/cmake/musa.cmake @@ -0,0 +1,128 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(NOT DEFINED ENV{MUSA_PATH}) + set(MUSA_PATH + "/usr/local/musa" + CACHE PATH "Path to which ROCm has been installed") +else() + set(MUSA_PATH + $ENV{MUSA_PATH} + CACHE PATH "Path to which ROCm has been installed") +endif() +set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +find_package(MUSA REQUIRED) +include_directories(${MUSA_PATH}/include) + +# set openmp include directory +set(llvm_openmp_search_list) +foreach(item RANGE 6 20 1) + list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/) +endforeach() + +find_path( + OPENMP_INCLUDE_DIR omp.h + PATHS ${llvm_openmp_search_list} REQUIRED + NO_DEFAULT_PATH) +include_directories(${OPENMP_INCLUDE_DIR}) + +macro(find_musa_version musa_version_file) + set(python_file ${PROJECT_BINARY_DIR}/get_version.py) + set(MUSA_VERSION + "None" + CACHE STRING "musa version" FORCE) + file( + WRITE ${python_file} + "" + "import json\n" + "import sys\n" + "with open(sys.argv[1], 'r') as f:\n" + " data = json.load(f)\n" + " print(data[\"musa_runtime\"][\"version\"])" + "") + + execute_process( + COMMAND "python" "${python_file}" ${musa_version_file} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE python_res + OUTPUT_VARIABLE python_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(python_res EQUAL 0) + set(MUSA_VERSION ${python_out}) + endif() + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\1" MUSA_MAJOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\2" MUSA_MINOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\3" MUSA_PATCH_VERSION + "${MUSA_VERSION}") + + if(NOT MUSA_MAJOR_VERSION) + set(MUSA_VERSION "???") + message(WARNING "Cannot find MUSA version in ${MUSA_PATH}/version.json") + else() + math( + EXPR + MUSA_VERSION + "${MUSA_MAJOR_VERSION} * 10000 + ${MUSA_MINOR_VERSION} * 100 + ${MUSA_PATCH_VERSION}" + ) + message(STATUS "Current MUSA version file is ${MUSA_PATH}/version.json.") + message( + STATUS + "Current MUSA version is v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION} " + ) + endif() +endmacro() +find_musa_version(${MUSA_PATH}/version.json) + +list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined) +list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy) +list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header) +list(APPEND MUSA_MCC_FLAGS -Wno-return-type) +list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare) +list(APPEND MUSA_MCC_FLAGS -Wno-overloaded-virtual) +list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags) +list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical) +list(APPEND MUSA_MCC_FLAGS -Wno-format) +list(APPEND MUSA_MCC_FLAGS -Wno-self-assign) +list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion) +list(APPEND MUSA_MCC_FLAGS -Wno-literal-range) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-private-field) +list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-value) +list(APPEND MUSA_MCC_FLAGS 
-Wno-unused-local-typedef) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture) +list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor) +list(APPEND MUSA_MCC_FLAGS -Wno-braced-scalar-init) +list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed) +list(APPEND MUSA_MCC_FLAGS -Wno-missing-braces) +list(APPEND MUSA_MCC_FLAGS -Wno-dangling-gsl) + +if(WITH_CINN) + list(APPEND MUSA_MCC_FLAGS -std=c++14) +else() + list(APPEND MUSA_MCC_FLAGS -std=c++17) +endif() + +list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_22) +list(APPEND MUSA_MCC_FLAGS -U__CUDA__) +# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer +list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__) + +#set(MUSA_VERBOSE_BUILD ON) +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND MUSA_MCC_FLAGS -g2) + list(APPEND MUSA_MCC_FLAGS -O0) +else() + list(APPEND MUSA_MCC_FLAGS -O2) +endif() + +set(musa_runtime_library_name musart) +find_library(MUSARTC_LIB ${musa_runtime_library_name} HINTS ${MUSA_PATH}/lib) +message(STATUS "MUSARTC_LIB: ${MUSARTC_LIB}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 95273118c2505..60966c41e95b9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,6 +84,11 @@ function(register_cu_kernel TARGET) ${TARGET} SRCS ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif(WITH_MUSA) + musa_library( + ${TARGET} + SRCS ${cu_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) endif() set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} @@ -151,14 +156,18 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_srcs) + set(mu_srcs) set(cu_cc_srcs) set(hip_cc_srcs) + set(mu_cc_srcs) set(xpu_cc_srcs) set(xpu_kp_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) + set(mudnn_cu_cc_srcs) set(cudnn_cu_srcs) set(miopen_cu_srcs) + set(mudnn_cu_srcs) set(CUDNN_FILE) set(MIOPEN_FILE) set(mkldnn_cc_srcs) @@ -237,6 +246,35 @@ function(op_library TARGET) list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu) endif() endif() + if(WITH_MUSA) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND mu_cc_srcs ${TARGET}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND mu_srcs ${TARGET}.cu) + endif() + # rename in KP: .kps -> .cu + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps + ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} + PARENT_SCOPE) + list(APPEND mu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + string(REPLACE "_op" "_cudnn_op" MUDNN_FILE "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu.cc) + list(APPEND mudnn_cu_cc_srcs ${MUDNN_FILE}.cu.cc) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu) + list(APPEND mudnn_cu_srcs ${MUDNN_FILE}.cu) + endif() + endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) @@ -267,6 +305,14 @@ function(op_library TARGET) list(APPEND miopen_cu_cc_srcs ${src}) elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$") list(APPEND hip_cc_srcs ${src}) + elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu$") + list(APPEND mudnn_cu_srcs ${src}) + elseif(WITH_MUSA AND ${src} MATCHES 
".*\\.cu$") + list(APPEND mu_srcs ${src}) + elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu.cc$") + list(APPEND mudnn_cu_cc_srcs ${src}) + elseif(WITH_MUSA AND ${src} MATCHES ".*\\.cu.cc$") + list(APPEND mu_cc_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$") list(APPEND cudnn_cu_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu$") @@ -285,13 +331,15 @@ function(op_library TARGET) list(APPEND xpu_kp_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) - elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$") + elseif((WITH_ROCM OR WITH_GPU OR WITH_MUSA) AND ${src} MATCHES ".*\\.kps$") string(REPLACE ".kps" ".cu" src_cu ${src}) file(COPY ${src} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${src} ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) if(WITH_ROCM) list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) + elseif(WITH_MUSA) + list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) else() list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) endif() @@ -391,6 +439,26 @@ function(op_library TARGET) SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif(WITH_MUSA) + list(REMOVE_ITEM mudnn_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") + list(REMOVE_ITEM mudnn_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") + list(REMOVE_ITEM mu_srcs "cholesky_op.cu") + list(REMOVE_ITEM mu_srcs "cholesky_solve_op.cu") + list(REMOVE_ITEM mu_srcs "lu_op.cu") + list(REMOVE_ITEM mu_srcs "matrix_rank_op.cu") + list(REMOVE_ITEM mu_srcs "svd_op.cu") + list(REMOVE_ITEM mu_srcs "eigvalsh_op.cu") + list(REMOVE_ITEM mu_srcs "qr_op.cu") + list(REMOVE_ITEM mu_srcs "eigh_op.cu") + list(REMOVE_ITEM mu_srcs "lstsq_op.cu") + list(REMOVE_ITEM mu_srcs "multinomial_op.cu") + list(REMOVE_ITEM mu_srcs "multiclass_nms3_op.cu") + message(STATUS "mu_cc_srcs: ${mu_cc_srcs}, cc_srcs: ${cc_srcs}") + musa_library( + ${TARGET} + SRCS ${cc_srcs} ${mu_cc_srcs} ${mudnn_cu_cc_srcs} ${mudnn_cu_srcs} + ${mkldnn_cc_srcs} ${mu_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) xpu_library( ${TARGET} @@ -424,8 +492,10 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH hip_srcs hip_srcs_len) + list(LENGTH mu_srcs mu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH hip_cc_srcs hip_cc_srcs_len) + list(LENGTH mu_cc_srcs mu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) @@ -536,12 +606,30 @@ function(op_library TARGET) endif() endforeach() + # pybind USE_OP_DEVICE_KERNEL for MUSA + list(APPEND mu_srcs ${mu_cc_srcs}) + message("mu_srcs ${mu_srcs}") + foreach(mu_src ${mu_srcs}) + set(op_name "") + find_register(${mu_src} "REGISTER_OP_CUDA_KERNEL" op_name) + find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_KERNEL") + find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") + find_phi_register(${mu_src} ${pybind_file} + "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") + set(pybind_flag 1) + endif() + endforeach() + + # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_srcs}) + list(APPEND cudnn_cu_srcs ${mudnn_cu_cc_srcs}) + list(APPEND 
cudnn_cu_srcs ${mudnn_cu_srcs}) list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) - #message("cudnn_cu_srcs ${cudnn_cu_srcs}") if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") @@ -656,7 +744,7 @@ function(register_operators) string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) - + message(STATUS "OPS in register_operators:${OPS}") foreach(src ${OPS}) list(FIND register_operators_EXCLUDES ${src} _index) if(${_index} EQUAL -1) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index ead66697ef68c..499cc4c591bbf 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -104,7 +104,7 @@ function(kernel_declare TARGET_LIST) endif() endif() # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if(WITH_ROCM) + if(WITH_ROCM OR WITH_MUSA) string(FIND "${first_registry}" "cuda_only" pos) if(pos GREATER 1) set(first_registry "") diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index 0047100ebcfdf..c333448d029ae 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -110,23 +110,16 @@ class Dim; macro__(Product) \ macro__(Sum) \ macro__(PrimitiveNode) \ + macro__(IntrinsicOp) \ macro__(_BufferRange_) \ macro__(ScheduleBlock) \ macro__(ScheduleBlockRealize) \ macro__(_Dim_) \ -#define NODETY_CONTROL_OP_FOR_INTRINSIC(macro__) \ - macro__(IntrinsicOp) \ #define NODETY_FORALL(__m) \ NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ NODETY_OP_FOR_EACH(__m) \ - NODETY_CONTROL_OP_FOR_INTRINSIC(__m) \ - NODETY_CONTROL_OP_FOR_EACH(__m) - -#define NODETY_FORALL_EXCEPT_INTRINSIC(__m) \ - NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ - NODETY_OP_FOR_EACH(__m) \ NODETY_CONTROL_OP_FOR_EACH(__m) // clang-format on diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index e4ebaca653bae..ac2f0317e9213 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -15,8 +15,6 @@ #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include -#include "paddle/cinn/ir/intrinsic_ops.h" -#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -73,71 +71,8 @@ struct IrNodesCollector : public IRVisitorRequireReImpl { } \ } - NODETY_FORALL_EXCEPT_INTRINSIC(__m) + NODETY_FORALL(__m) #undef __m - - void Visit(const ir::IntrinsicOp* op) { - switch (op->getKind()) { -#define __(x) \ - case ir::IntrinsicKind::k##x: \ - Visit(llvm::dyn_cast(op)); \ - break; - - INTRINSIC_KIND_FOR_EACH(__) -#undef __ - } - } - - void Visit(const ir::intrinsics::GetAddr* x) { - if (x->data.defined()) { - Visit(&(x->data)); - } - } - - void Visit(const ir::intrinsics::BufferGetDataHandle* x) { - if (x->buffer.defined()) { - Visit(&(x->buffer)); - } - } - - void Visit(const ir::intrinsics::BufferGetDataConstHandle* x) { - if (x->buffer.defined()) { - Visit(&(x->buffer)); - } - } - - void Visit(const ir::intrinsics::PodValueToX* x) { - if (x->pod_value_ptr.defined()) { - Visit(&(x->pod_value_ptr)); - } - } - - void Visit(const ir::intrinsics::BufferCreate* x) { - if (x->buffer.defined()) { - Visit(&(x->buffer)); - } - } - - void Visit(const ir::intrinsics::ArgsConstruct* x) { - if (x->var.defined()) { - Expr convert = Expr(x->var); - Visit(&convert); - } - for (int i = 0; i < x->args.size(); ++i) { - if (x->args[i].defined()) { - Visit(&(x->args[i])); - } - } - } - - void Visit(const 
ir::intrinsics::BuiltinIntrin* x) { - for (int i = 0; i < x->args.size(); ++i) { - if (x->args[i].defined()) { - Visit(&(x->args[i])); - } - } - } - std::set visited_; }; diff --git a/paddle/common/array.h b/paddle/common/array.h index 11457a1eaa756..20f7904fc3bd1 100644 --- a/paddle/common/array.h +++ b/paddle/common/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,7 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static T obj{}; @@ -114,7 +114,7 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static const T obj{}; diff --git a/paddle/common/hostdevice.h b/paddle/common/hostdevice.h index 7f8cf13563434..f7070893d83b5 100644 --- a/paddle/common/hostdevice.h +++ b/paddle/common/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/common/macros.h b/paddle/common/macros.h index 2d476c58cb6ae..8189b3147db8c 100644 --- a/paddle/common/macros.h +++ b/paddle/common/macros.h @@ -72,7 +72,7 @@ namespace common { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index d42b810972dc8..dd6309f7da360 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -15,7 +15,7 @@ if(WITH_DISTRIBUTE) DEPS phi common eager_api gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) cc_library( process_group_nccl SRCS process_group_nccl.cc common.cc @@ -63,7 +63,7 @@ if(WITH_CUSTOM_DEVICE) endif() set(COMM_UTILS_DEPS process_group) -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc 
b/paddle/fluid/distributed/collective/process_group_nccl.cc index 6732ea375d500..dd3e1f410ee0d 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -106,6 +106,8 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // If we use the work to do barrier, we should block cpu #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -137,18 +139,20 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { } void ProcessGroupNCCL::GroupStart() { - NCCL_CHECK(phi::dynload::ncclGroupStart()); + MCCL_CHECK(phi::dynload::mcclGroupStart()); ++s_group_call_counter; } void ProcessGroupNCCL::GroupEnd() { - NCCL_CHECK(phi::dynload::ncclGroupEnd()); + MCCL_CHECK(phi::dynload::mcclGroupEnd()); --s_group_call_counter; // NOTE: This is to sync the calc stream and comm stream for debug using // batch_isend_irecv if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -179,7 +183,7 @@ phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( } } -ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { +mcclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { const std::string& key = GetKeyFromPlace(place); const auto& iter = place_to_comm_ctx_.find(key); PADDLE_ENFORCE_NE( @@ -204,7 +208,7 @@ std::shared_ptr ProcessGroupNCCL::AllGather( numel > 0 ? GetPartialTensor(tensor_tmp, offset, numel) : tensor_tmp; return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[ncclAllGather] " + VLOG(3) << "[mcclAllGather] " << "sendbuff: " << in_tensor_maybe_partial.data() << ", recvbuff: " << out_tensor->data() << ", count: " << in_tensor_maybe_partial.numel() @@ -235,7 +239,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor); return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[ncclAllReduce] " + VLOG(3) << "[mcclAllReduce] " << "sendbuff: " << tensor_tmp.data() << ", recvbuff: " << out_tensor->data() << ", count: " << tensor_tmp.numel() << ", datatype: " @@ -704,7 +708,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, << ", store_key: " << store_key; for (size_t i = 0; i < s_group_call_counter; ++i) { - NCCL_CHECK(phi::dynload::ncclGroupEnd()); + MCCL_CHECK(phi::dynload::mcclGroupEnd()); } bool is_batch_p2p = s_group_call_counter > 0; @@ -713,13 +717,13 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, int num_ranks = is_p2p_op ? 2 : GetSize(); int rank = is_p2p_op ? 
p2p_rank : GetRank(); - NCCL_CHECK(phi::dynload::ncclGroupStart()); + MCCL_CHECK(phi::dynload::mcclGroupStart()); phi::distributed::P2POption p2p_opts({is_p2p_op, p2p_rank, num_ranks, rank}); phi::distributed::CommContextManager::CreateNCCLCommContext( store_, store_key, rank_, size_, "", &p2p_opts); - NCCL_CHECK(phi::dynload::ncclGroupEnd()); + MCCL_CHECK(phi::dynload::mcclGroupEnd()); auto nccl_comm_ctx = this->GetCommContext(&store_key); VLOG(3) << "Get nccl comm: " << nccl_comm_ctx->GetNcclComm() @@ -747,10 +751,10 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()), gpu_global_ranks_size); - NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(), + MCCL_CHECK(phi::dynload::mcclAllGather(gpu_global_rank->ptr(), gpu_global_ranks->ptr(), 1, - ncclInt, + mcclInt, nccl_comm_ctx->GetNcclComm(), comm_ctx->stream())); @@ -783,7 +787,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, place_to_comm_ctx_.emplace(place_key, std::move(comm_ctx)); for (size_t i = 0; i < s_group_call_counter; ++i) { - NCCL_CHECK(phi::dynload::ncclGroupStart()); + MCCL_CHECK(phi::dynload::mcclGroupStart()); } } @@ -878,6 +882,8 @@ std::shared_ptr ProcessGroupNCCL::Collective( if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -993,6 +999,8 @@ std::shared_ptr ProcessGroupNCCL::Point2Point( if (!is_batch_p2p && (FLAGS_benchmark || FLAGS_benchmark_nccl)) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/fluid/distributed/collective/process_group_nccl.h b/paddle/fluid/distributed/collective/process_group_nccl.h index 22d90370f16af..8a626d701b324 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -175,7 +175,7 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream { static void GroupEnd(); - ncclComm_t NCCLComm(const Place& place) const; + mcclComm_t NCCLComm(const Place& place) const; private: std::shared_ptr CreateTask(const Place& place, diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc index eec697f523945..9061ce7aeaa06 100644 --- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc +++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/backends/c_comm_lib.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -33,7 +33,7 @@ namespace detail { // In principle, the PHI Kernel cannot use the global singleton internally, // and the required members need to be passed in from the eucalyptus tree. 
ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::distributed::ProcessGroup* pg = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( @@ -45,7 +45,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { } #endif if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) return static_cast(pg)->NCCLComm( place); #else diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 6165dfc27e38e..591e083d005a4 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -372,7 +372,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { paddle::experimental::empty(IntArray({all_length_}), dtype_, place); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( @@ -419,7 +419,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto &gpu_context = static_cast(context); SplitTensorsWithType( gpu_context, &dense_contents_, &dense_tensors_, dtype_); @@ -1112,7 +1112,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); // NOLINT if (platform::is_gpu_place(inner_place_)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index 7b19b3a109839..17f7bb14224d3 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { @@ -78,16 +77,9 @@ class ChunkAllocator { void create_new_chunk() { Chunk* chunk; - size_t alloc_size = sizeof(Chunk) + sizeof(Node) * _chunk_size; - int error = posix_memalign(reinterpret_cast(&chunk), - std::max(sizeof(void*), alignof(Chunk)), - alloc_size); - PADDLE_ENFORCE_EQ(error, - 0, - paddle::platform::errors::ResourceExhausted( - "Fail to alloc memory of %ld size, error code is %d.", - alloc_size, - error)); + posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + sizeof(Chunk) + sizeof(Node) * _chunk_size); chunk->next = _chunks; _chunks = chunk; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 82a3514f2791f..c896786c657f6 100644 --- 
a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -277,7 +277,7 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 704dd16400065..61e0732f89f5b 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,7 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index a1fd38295319e..0117a472ef06d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index b5786e2393393..6dc9cff9d9120 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -51,7 +51,7 @@ void MessageBus::Init( addr_)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) // NOTE: To make the brpc is compatible with collective, // need release the handler holding the ip address. 
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index 47509d025722d..2bd9213cae610 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -61,9 +61,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG_FIRST_N(WARNING, 1) - << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG(WARNING) << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); @@ -408,9 +407,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG_FIRST_N(WARNING, 1) - << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG(WARNING) << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index a6bb716e6b7ad..bef2878e706f5 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -27,6 +27,10 @@ if(WITH_ROCM) target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB}) endif() +if(WITH_MUSA) + target_link_libraries(eager_generator ${MUSARTC_LIB}) +endif() + if(WITH_CINN) target_link_libraries(eager_generator ${PYTHON_LIBRARIES}) endif() diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 75d6cb94c6b5f..2a96fddccbce7 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -528,7 +528,7 @@ class {} : public egr::GradNodeBase {{ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ VLOG(5) << "got different data type, run type protmotion automatically."; - LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + LOG(WARNING) << "got different data type, run type protmotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index daf16f446ab12..f93f41a21553a 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -146,7 +146,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if (paddle::platform::is_gpu_place(place)) {{ -#if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index a1e62ea6ba519..2da9994b7671c 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -103,7 +103,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8aa03e98809fb..8aab6bf2a201a 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -213,6 +213,11 @@ elseif(WITH_ROCM) data_type_transform SRCS data_type_transform.cu DEPS tensor) +elseif(WITH_MUSA) + musa_library( + data_type_transform + SRCS data_type_transform.cu + DEPS tensor) elseif(WITH_XPU) cc_library( data_type_transform @@ -461,7 +466,7 @@ if(WITH_PYTHON) ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if(NOT WITH_ROCM) + if(WITH_GPU) add_custom_target( fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 1620c99ce8560..6621b74740f25 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -45,6 +45,19 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } +#elif defined(PADDLE_WITH_MUSA) + // AlgorithmsCache* GetForward() { + // return &forward_cache_; + // } + // AlgorithmsCache* GetBackwardData() { + // return &backward_data_cache_; + // } + // AlgorithmsCache* GetBackwardFilter() { + // return &backward_filter_cache_; + // } + // AlgorithmsCache* GetConvFusion() { + // return &fusion_forward_cache_; + // } #else AlgorithmsCache* GetForward() { return &forward_cache_; @@ -72,6 +85,11 @@ class ConvSearchCache { AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; +#elif defined(PADDLE_WITH_MUSA) + // AlgorithmsCache forward_cache_; + // AlgorithmsCache backward_data_cache_; + // AlgorithmsCache backward_filter_cache_; + // AlgorithmsCache fusion_forward_cache_; #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index bf2f9e4379b69..4d2236ed1e66f 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -124,7 +124,7 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (custom_in.is_gpu_pinned()) { VLOG(3) << "Custom 
Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -936,7 +936,7 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 4a72f339a85cb..d3525c80d56db 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -271,6 +271,8 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); #elif defined(PADDLE_WITH_HIP) hipMemcpy(dst, src, size, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(dst, src, size, musaMemcpyHostToDevice); #elif defined(PADDLE_WITH_XPU_KP) xpu_memcpy(dst, src, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE); #else @@ -1529,7 +1531,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 156c70b982538..57cf488d2a301 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -2982,7 +2982,7 @@ std::shared_ptr GetNodeDegree( } int multi_node_sync_sample(int flag, - const ncclRedOp_t &op, + const mcclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, phi::DenseTensor *multi_node_sync_stat_ptr) { @@ -2998,8 +2998,8 @@ int multi_node_sync_sample(int flag, int *stat_ptr = multi_node_sync_stat_ptr->data(); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - &stat_ptr[flag], &stat_ptr[3], 1, ncclInt, op, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + &stat_ptr[flag], &stat_ptr[3], 1, mcclInt, op, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output &stat_ptr[3], sizeof(int), @@ -3011,7 +3011,7 @@ int multi_node_sync_sample(int flag, } int get_multi_node_global_flag(int local_flag, - const ncclRedOp_t &op, + const mcclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, cudaStream_t stream) { @@ -3025,10 +3025,10 @@ int get_multi_node_global_flag(int local_flag, send_buff_ptr, &local_flag, sizeof(int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - ncclInt, + mcclInt, op, comm->comm(), stream)); @@ -3177,7 +3177,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // to decide whether to continue sampling if (FLAGS_enable_graph_multi_node_sampling) { switch_command = multi_node_sync_sample( - switch_flag, ncclProd, place, conf.gpuid, 
multi_node_sync_stat_ptr); + switch_flag, mcclProd, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " switch_flag:" << switch_flag << "," << switch_command; if (switch_command) { @@ -3187,7 +3187,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, } sample_command = multi_node_sync_sample( - sample_flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + sample_flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " sample_flag:" << sample_flag << "," << sample_command; if (sample_command == EVENT_FINISH_EPOCH) { @@ -3280,7 +3280,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, if (FLAGS_enable_graph_multi_node_sampling) { int flag = *jump_rows_ptr > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; if (command <= 0) { @@ -3326,7 +3326,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // Step synchronization for multi-step sampling in multi node int flag = sample_res.total_sample_size > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; @@ -3846,7 +3846,7 @@ void GraphDataGenerator::DoWalkandSage() { } else { if (conf_.sage_mode) { global_train_flag_ = get_multi_node_global_flag( - local_train_flag, ncclProd, place_, conf_.gpuid, sample_stream_); + local_train_flag, mcclProd, place_, conf_.gpuid, sample_stream_); VLOG(1) << "gpu_id: " << conf_.gpuid << ", local_train_flag: " << local_train_flag << ", global_train_flag: " << global_train_flag_; @@ -4010,7 +4010,7 @@ void GraphDataGenerator::DoSageForTrain() { // check whether reach sage pass end if (conf_.is_multi_node) { int res = multi_node_sync_sample(sage_pass_end, - ncclProd, + mcclProd, place_, conf_.gpuid, &multi_node_sync_stat_); @@ -4165,7 +4165,7 @@ void GraphDataGenerator::DoSageForInfer() { int local_pass_end = total_instance == 0; if (conf_.is_multi_node) { global_pass_end = get_multi_node_global_flag( - local_pass_end, ncclProd, place_, conf_.gpuid, sample_stream_); + local_pass_end, mcclProd, place_, conf_.gpuid, sample_stream_); } else { global_pass_end = local_pass_end; } @@ -4261,11 +4261,11 @@ int dynamic_adjust_total_row_for_infer(int local_reach_end, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - ncclInt, - ncclProd, + mcclInt, + mcclProd, comm->comm(), stream)); int global_reach_end = 0; @@ -4356,7 +4356,7 @@ bool FillInferBuf( global_infer_node_type_start[infer_cursor] + conf.buf_size >= device_key_size; int global_reach_end = get_multi_node_global_flag( - local_reach_end, ncclProd, place, conf.gpuid, stream); + local_reach_end, mcclProd, place, conf.gpuid, stream); int remain = device_key_size - global_infer_node_type_start[infer_cursor]; if (global_reach_end) { *total_row_ptr = remain; @@ -5005,11 
+5005,11 @@ int GraphDataGenerator::dynamic_adjust_batch_num_for_sage() { cudaStreamSynchronize(sample_stream_); auto comm = platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - ncclInt, - ncclMax, + mcclInt, + mcclMax, comm->comm(), sample_stream_)); int thread_max_batch_num = 0; diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 243c5c818f588..492c7629abf9e 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -2023,7 +2023,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index 88afa021b7c1b..010661fef6e8a 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,7 @@ REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9d114fcf56396..b2fb089f53574 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f0c2b60f41b69..f43c20a0d3a94 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -221,6 +221,75 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +elseif(WITH_MUSA) + musa_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc + DEPS framework_proto scope place phi common) + musa_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + phi + common + memory + dynload_cuda + variable_visitor) + musa_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS all_reduce_op_handle + op_handle_base + variable_visitor + scope + lod_tensor + phi + common + memory + dynload_cuda + place) + musa_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS fused_all_reduce_op_handle + op_handle_base + scope + lod_tensor + phi + common + memory + dynload_cuda + variable_visitor + place + all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + musa_library( + reduce_op_handle 
+ SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) + else() + musa_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) + endif() + musa_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base + scope + phi + common + memory + variable_visitor + dynload_cuda) + musa_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) else() cc_library( nan_inf_utils @@ -420,7 +489,7 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b064a2aded0bc..087a629d49344 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -28,7 +28,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -207,17 +207,17 @@ void AllReduceOpHandle::AllReduceFunc( const std::vector &places, const std::vector &out_var_names) { if (platform::is_gpu_place(places[0])) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::InvalidArgument( "The nccl context should not be NULL.")); - ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); + mcclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); std::vector> all_reduce_calls; for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { auto &p = places[i]; void *buffer = const_cast(lod_tensor_data.at(i)); all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum); + NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, mcclSum); }); } NCCLAllReduceFunc(all_reduce_calls); @@ -300,7 +300,7 @@ void AllReduceOpHandle::SyncBKCLAllReduce() { } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 685ab0b957a44..0e2c06311bf38 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -31,7 +31,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include 
"paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class AllReduceOpHandle : public NCCLOpHandleBase { public: AllReduceOpHandle(ir::Node *node, @@ -77,14 +77,14 @@ class AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; -#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && \ +#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) && \ !defined(PADDLE_WITH_XPU_BKCL) // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes. // Will polish it by class inheritance framework. std::vector places_; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void NCCLAllReduceFunc( const std::vector> &all_reduce_calls); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index b79eff24ee87d..98672d09a2452 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -88,7 +88,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else if (platform::is_gpu_place(in_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) VarHandle *out_handle = nullptr; int root_id = in_tensor.place().device; // NOLINT std::vector> broadcast_calls; @@ -118,9 +118,9 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclBcast(send_recv_buffer, + platform::dynload::mcclBcast(send_recv_buffer, numel, - static_cast(type), + static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 9fbe2764913b5..3300c48b16585 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -34,7 +34,7 @@ class Node; } // namespace ir } // namespace framework namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) struct NCCLContextMap; #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ struct BKCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -55,7 +55,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -109,7 +109,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) 
|| defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) const platform::NCCLContextMap *nccl_ctxs_; #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLContextMap *bkcl_ctxs_; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5a6f4e6e70d4c..5b8857977c9fa 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -348,7 +348,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const { #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) @@ -380,7 +380,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kNRanks); pass->Set(kNRanks, new size_t(nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -400,7 +400,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -428,7 +428,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; } else if (pass->Type() == "all_reduce_deps_pass") { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? 
nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -545,7 +545,7 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_DNNL USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 203525d5a7482..90cf7fe82ebfd 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -217,7 +217,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const; #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4012263f688cb..89d72a1b8213a 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,7 +44,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -53,6 +53,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -75,12 +78,14 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -89,7 +94,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -177,7 +182,7 @@ void EagerDeletionOpHandle::RunImpl() { void 
EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = @@ -187,6 +192,10 @@ void EagerDeletionOpHandle::ClearGarbages( PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -197,7 +206,7 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 0a92269c50ad2..049b0c2ec478b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,7 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index ee78d36671107..be3b196c3ca6c 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,7 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 27be4b7717635..0ab7767aca0ba 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,7 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 53746482d58a8..b1db6b334013d 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,7 @@ typedef std::vector< std::vector>> 
GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,11 +61,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -80,7 +82,7 @@ void FusedAllReduceOpHandle::RunImpl() { Name(), platform::TracerEventType::Communication, 1); VLOG(4) << this->DebugString(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { VLOG(10) << "FLAGS_allreduce_record_one_event=true"; PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, @@ -103,6 +105,9 @@ void FusedAllReduceOpHandle::RunImpl() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -126,6 +131,10 @@ void FusedAllReduceOpHandle::RunImpl() { PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -185,12 +194,16 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 533d1d0860a55..a5c6c431f1742 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) 
#include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -44,7 +44,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -75,7 +75,7 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { private: size_t num_of_all_reduce_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) gpuEvent_t start_event_{nullptr}; gpuEvent_t end_event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 6ba6df7011ade..198fb8b6eb07e 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -36,7 +36,7 @@ struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -46,7 +46,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 15648aa058f07..2ebaa31f53bd8 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -24,7 +24,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -77,7 +77,7 @@ std::string GradMergeAllReduceOpHandle::Name() const { return "grad_merge_all_reduce"; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index ce01f85eaba52..5e8d061762cbc 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class NCCLCommunicator; } // namespace 
platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -44,7 +44,7 @@ namespace details { class GradMergeAllReduceOpHandle : public AllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) GradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -75,7 +75,7 @@ class GradMergeAllReduceOpHandle : public AllReduceOpHandle { class FusedGradMergeAllReduceOpHandle : public FusedAllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedGradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 6c3f5356ac1f1..91cb342594a63 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" << tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index e4472e8d989dd..ab7c4ecd88468 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,6 +27,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/dynload/mccl.h" +#endif #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/flags.h" @@ -55,6 +58,8 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -62,6 +67,8 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -72,7 +79,7 @@ class NCCLOpHandleBase : public OpHandleBase { return nccl_ctxs_; } - ncclComm_t GetComm() const { + mcclComm_t GetComm() const { PADDLE_ENFORCE_EQ( places_.size(), 1, @@ -143,6 +150,11 @@ class NCCLOpHandleBase : public OpHandleBase { &inter_events_[dev_id], hipEventDisableTiming)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &exter_events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( + &inter_events_[dev_id], 
musaEventDisableTiming)); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( + &exter_events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); @@ -159,8 +171,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op) { + mcclDataType_t datatype, + mcclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -176,7 +188,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -184,8 +196,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op) { + mcclDataType_t datatype, + mcclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -203,8 +215,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op) { + mcclDataType_t datatype, + mcclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -224,8 +236,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op UNUSED) { + mcclDataType_t datatype, + mcclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -238,11 +250,13 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( + sendbuff, recvbuff, count, datatype, mcclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP hipEventRecord(inter_events_.at(dev_id), stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(inter_events_.at(dev_id), stream); #else cudaEventRecord(inter_events_.at(dev_id), stream); #endif @@ -256,8 +270,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op) { + mcclDataType_t datatype, + mcclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); PADDLE_ENFORCE_NOT_NULL( nccl_ctxs_, @@ -276,14 +290,21 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + sendbuff, recvbuff, count, datatype, op, comm, stream)); + + musaEventRecord(exter_events_.at(dev_id), stream); #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), 
stream); @@ -296,8 +317,8 @@ class NCCLOpHandleBase : public OpHandleBase { void InterBroadCast(platform::Place place, void* sendbuff, size_t count, - ncclDataType_t datatype, - ncclRedOp_t op UNUSED) { + mcclDataType_t datatype, + mcclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -310,10 +331,12 @@ class NCCLOpHandleBase : public OpHandleBase { << ", stream:" << stream; #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0); +#elif defined(PADDLE_WITH_MUSA) + musaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index ee87141a9d541..896b251571fc9 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,11 +31,13 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -45,13 +47,16 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; // NOLINT platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -136,7 +141,7 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) if (events_.empty() && use_device == p::kCUDA && !dev_ctxes_.empty()) { InitCUDA(); } @@ -172,7 +177,7 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -188,6 +193,8 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, 
ev.second, 0)); #endif @@ -221,12 +228,15 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -248,7 +258,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -273,13 +283,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -311,7 +324,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; @@ -320,6 +333,9 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -331,7 +347,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9afe56e4babd4..4bd385ff5099c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,7 @@ class OpHandleBase { // See https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::unordered_map events_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index fe43126ca8abe..d7d0a3e286363 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -182,7 +182,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( @@ -210,12 +210,12 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( buffer, recvbuffer, numel, - static_cast(type), - ncclSum, + static_cast(type), + mcclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 2eb0ad2923211..eb0e319cce3b5 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -39,7 +39,7 @@ namespace platform { struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -79,7 +79,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -129,7 +129,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector GetLocalScopes() override { return local_scopes_; } -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) && \ defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 8b486be9cc686..f37ea73a477b6 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,7 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +110,7 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); 
tensor->Resize(common::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 02a68fb697efb..cb16915316ecf 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index ba678bbe2e26b..5c266946144fe 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -196,7 +196,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto comm = nccl_ctx.comm_; int encode_size = 2 * k * sizeof(int); - // dgc use ncclAllGather to get all the encoded data + // dgc use mcclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; void *gather_buff = gathers[i]->data(); @@ -207,10 +207,10 @@ void SparseAllReduceOpHandle::RunImplEncoded() { all_gather_calls.emplace_back([=] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(in_tensor_buf, + platform::dynload::mcclAllGather(in_tensor_buf, gather_buff, 2 * k, - static_cast(dtype), + static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index a6314220d5c26..9a130bea0d3a2 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,7 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +154,7 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index d7714808ff08a..e448f80ae3938 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -53,7 +53,7 @@ class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -85,12 +85,12 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -155,7 +155,7 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; #endif std::vector places_; @@ -186,7 +186,7 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -588,7 +588,7 @@ class HeterCpuWorker : public HogwildWorker { }; #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUWorker : public HogwildWorker { @@ -604,7 +604,7 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -672,7 +672,7 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -718,7 +718,7 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -845,7 +845,7 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index 5c920fa3e318f..c4ef22ebfe82c 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -77,13 +77,13 @@ REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker); REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL|| \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 1e1a02f944f65..4c6e19fd964bb 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; // NOLINT @@ -108,7 +108,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 5dee8b04e78b7..659bdcaaf9516 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -50,6 +50,12 @@ if(WITH_HETERPS) SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) + elseif(WITH_MCCL) + musa_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) endif() else() cc_library( @@ -58,7 +64,7 @@ else() DEPS gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) cc_library( nccl_wrapper SRCS nccl_wrapper.cc @@ -77,6 +83,12 @@ if(WITH_BOX_PS) SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps) endif() + if(WITH_MUSA) + musa_library( + box_wrapper + SRCS box_wrapper.cc box_wrapper.cu + DEPS framework_proto lod_tensor box_ps) + endif() else() cc_library( box_wrapper diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 5f46906cf8e82..0d1c4aba87dc5 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -161,6 +161,11 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + values.data(), + values.size() * 
sizeof(float*), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), @@ -250,6 +255,10 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, slot_num, total_len); hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( + origin_keys, total_keys, gpu_len, slot_num, total_len); + musaStreamSynchronize(stream); #else CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( origin_keys, total_keys, gpu_len, slot_num, total_len); @@ -295,6 +304,19 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, slot_vector_.data(), slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); + musaMemcpy(d_slot_vector, + slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, grad_values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 9853c328cd14e..b3432277805a7 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -595,6 +595,9 @@ class BoxWrapper { data->resize(len); #ifdef PADDLE_WITH_HIP hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index d72e418aadd3e..9eb4360e7dd08 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = keys_tensor[device_id]; @@ -70,6 +70,15 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -153,7 +162,7 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = 
keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 05433c1014656..7ac9e4f7302a6 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,7 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -816,6 +816,9 @@ void FleetWrapper::PushDenseVarsAsync( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); + musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fb5cf91729256..1284b379c9f20 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,7 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 0af67107f0cbc..1dbd675073dd7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -96,3 +96,18 @@ if(WITH_ROCM) SRCS heter_ps.cu DEPS heter_comm) endif() +if(WITH_MUSA) + musa_library( + heter_comm + SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h + hashtable.h + DEPS cub device_context) + musa_test( + test_heter_comm + SRCS feature_value.h + DEPS heter_comm) + musa_library( + heter_ps + SRCS heter_ps.cu + DEPS heter_comm) +endif() \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 3bf395071df27..b5d788840ee54 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -846,7 +846,7 @@ void GraphGpuWrapper::init_service() { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); } } @@ -860,13 +860,13 @@ void GraphGpuWrapper::init_service() { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(device_id_mapping[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); rank_id_ = gloo->Rank(); 
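// Hedged sketch of the runtime-API dispatch used in PushDenseVarsAsync above:
// the MUSA runtime mirrors the CUDA/HIP event calls, so each branch differs
// only in its prefix. The wrapper name is an assumption for illustration; the
// gpuEvent_t/gpuStream_t aliases and per-backend calls follow the patch.
static void RecordAndWaitEvent(gpuEvent_t event, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream));
  hipEventSynchronize(event);
#elif defined(PADDLE_WITH_MUSA)
  PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream));
  musaEventSynchronize(event);
#else
  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream));
  cudaEventSynchronize(event);
#endif
}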
node_size_ = gloo->Size(); diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 315a9860ed67a..4045c615a27cb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #ifdef PADDLE_WITH_HETERPS -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include @@ -302,9 +302,9 @@ class GraphGpuWrapper { int node_size_ = 1; int multi_node_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector inner_comms_; - std::vector inter_comms_; - std::vector inter_ncclids_; + std::vector inner_comms_; + std::vector inter_comms_; + std::vector inter_ncclids_; #endif }; // class GraphGpuWrapper #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 18e3966b220c0..b869ad1c235cb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -166,8 +166,8 @@ class HeterComm { size_t len, Sgd& sgd); // NOLINT - void set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, + void set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size, int rank_id) { nccl_inner_comms_ = inner_comms; @@ -791,8 +791,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) GpuRDMAChecker* rdma_checker_ = nullptr; - std::vector nccl_inner_comms_; - std::vector nccl_inter_comms_; + std::vector nccl_inner_comms_; + std::vector nccl_inter_comms_; int multi_mf_dim_{8}; int max_mf_dim_ = 8; std::vector> allocators_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 36fe556bcf3fb..3df6e6e89861f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -2870,7 +2870,7 @@ size_t HeterComm::send_data_by_all2all( auto &loc = storage_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); size_t total_fea_num = 0; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (int i = 0; i < nccl_node_size; i++) { if (i == nccl_rank_id) { continue; @@ -2881,7 +2881,7 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(&d_send_buff[send_offset], send_size * value_bytes, - ncclInt8, + mcclInt8, i, comm, nccl_stream)); @@ -2893,14 +2893,14 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( reinterpret_cast(&d_rev_buff[recv_offset]), recv_size * value_bytes, - ncclInt8, + mcclInt8, i, comm, nccl_stream)); total_fea_num += recv_size; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); return total_fea_num; @@ -2959,11 +2959,11 @@ size_t HeterComm:: cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( &res.d_node_size_ptr[rank_offset], reinterpret_cast(res.d_node_size_ptr), 
node_size_, - ncclInt, + mcclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); @@ -3780,11 +3780,11 @@ size_t HeterComm:: my_cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( &res.d_node_size_ptr[rank_id_ * node_size_], reinterpret_cast(res.d_node_size_ptr), node_size_, - ncclInt, + mcclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 3fe05753e09a3..017e3726357b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -134,8 +134,8 @@ void HeterPs::push_sparse(int num, template class GPUOptimizer> void HeterPs::set_nccl_comm_and_size( - const std::vector& inner_comms, - const std::vector& inter_comms, + const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size, int rank_id) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size, rank_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index c472c2ed75a9d..d1c1d0c8b611b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -49,8 +49,8 @@ class HeterPs : public HeterPsBase { size_t chunk_size, int stream_num) override; #if defined(PADDLE_WITH_CUDA) - void set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, + void set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size, int rank_id) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 8624425d8bfbd..b729cdfcbb0f9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -46,8 +46,8 @@ class HeterPsBase { virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( - const std::vector& inner_comms, - const std::vector& inter_comms, + const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size, int rank_id) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index a8ce9be92bdf6..97b704b4f3d21 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +141,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterWrapper::DeSerializeToTensor(Scope* scope, const 
VariableMessage& req_var, platform::Place place, @@ -169,7 +169,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 77838fbec6d00..70cbce2acc24d 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,7 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 640f7dd08dc8d..8be530c3170ba 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -21,9 +21,9 @@ std::shared_ptr NCCLWrapper::s_instance_ = NULL; bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclCommInitRank(&(nccl_info_.comm_), + platform::dynload::mcclCommInitRank(&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); @@ -32,16 +32,16 @@ void NCCLWrapper::InitNCCL() { } void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) nccl_info_.nccl_id_ = nccl_info.nccl_id_; #endif return; } NCCLInfo NCCLWrapper::GetNCCLId() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); + platform::dynload::mcclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -49,13 +49,15 @@ NCCLInfo NCCLWrapper::GetNCCLId() { void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, const int ranks) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); +#elif defined(PADDLE_WITH_MCCL) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&(nccl_info_.stream_))); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif @@ -66,20 +68,22 @@ void NCCLWrapper::SetRankInfo(const int local_rank, void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, const std::vector& var_names) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) for (auto& name : 
var_names) { auto var = scope.FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( reinterpret_cast(tensor->data()), total_size, - ncclFloat, + mcclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL hipStreamSynchronize(nccl_info_.stream_); +#elif defined(PADDLE_WITH_MCCL) + musaStreamSynchronize(nccl_info_.stream_); #else cudaStreamSynchronize(nccl_info_.stream_); #endif diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h index 7e9cc0c56a6b4..46cdae20395e9 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.h +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -31,6 +31,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif + #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { @@ -51,9 +55,9 @@ class NCCLInfo { int local_rank_; int global_ranks_; int my_global_rank_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclUniqueId nccl_id_; - ncclComm_t comm_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + mcclUniqueId nccl_id_; + mcclComm_t comm_; gpuStream_t stream_; #endif }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index edfa4048b5528..85fe092e963db 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -314,7 +314,7 @@ class PSGPUWrapper { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); } } @@ -328,13 +328,13 @@ class PSGPUWrapper { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(dev_ids[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); rank_id_ = gloo->Rank(); node_size_ = gloo->Size(); @@ -979,9 +979,9 @@ class PSGPUWrapper { uint64_t table_id_; int gpu_graph_mode_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector inner_comms_; - std::vector inter_comms_; - std::vector inter_ncclids_; + std::vector inner_comms_; + std::vector inter_comms_; + std::vector inter_ncclids_; #endif std::vector heter_devices_; std::unordered_set gpu_ps_config_keys_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index d0620381ae8e9..5f9db8c20d51f 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. 
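// Hedged sketch of the type switch made in nccl_wrapper.h above: under MCCL
// the wrapper stores mcclUniqueId/mcclComm_t instead of the NCCL types, while
// the stream stays gpuStream_t. A hypothetical alias block (alias names are
// assumed, relevant backend headers assumed included) would keep a single
// spelling at every declaration site:
#if defined(PADDLE_WITH_MCCL)
using gpuCclComm_t = mcclComm_t;       // illustrative alias, not in the patch
using gpuCclUniqueId_t = mcclUniqueId;
#elif defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
using gpuCclComm_t = ncclComm_t;
using gpuCclUniqueId_t = ncclUniqueId;
#endif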
#include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/framework/garbage_collector.h" @@ -64,7 +64,7 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -93,6 +93,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_ = @@ -201,7 +203,7 @@ std::unique_ptr CreateGarbageCollector( const platform::Place &place, const size_t max_memory_size) { std::unique_ptr gc = nullptr; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 5376739624d6f..f9d94600a513d 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,7 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b98094ab74101..83dbe31d86a5a 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif @@ -1202,20 +1202,20 @@ bool HogwildWorker::CheckBatchNum(int flag) { // comm_ctx->AllReduce only support allreduce on the whole tensor, // single element is not supported now. 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(&stat_ptr[flag], + platform::dynload::mcclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - ncclProd, + mcclProd, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - ncclProd, + mcclProd, comm->comm(), stream)); } @@ -1246,11 +1246,11 @@ bool HogwildWorker::GetPassEnd(int flag) { // auto stream = static_cast(dev_ctx_)->stream(); // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - ncclProd, + mcclProd, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output @@ -1267,7 +1267,7 @@ bool HogwildWorker::GetPassEnd(int flag) { void HogwildWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -1473,7 +1473,7 @@ void HogwildWorker::TrainFiles() { platform::Timer timeline; timeline.Start(); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 46183fd93e97f..d0c11c3098ddb 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) add_subdirectory(fusion_group) endif() @@ -169,7 +169,7 @@ if(WITH_TENSORRT) pass_library(trt_remove_amp_strategy_op_pass inference) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() @@ -493,7 +493,7 @@ cc_test( SRCS relu6_fuse_pass_test.cc DEPS relu6_fuse_pass) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) cc_test( test_embedding_eltwise_layernorm_fuse_pass SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -543,7 +543,7 @@ if(WITH_MKLDNN) device_context phi common) - if(WITH_GPU OR WITH_ROCM) + if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() cc_test( diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index a54138060283b..a28930961efa0 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +152,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 048b33a649f94..e0a9502c685d2 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,8 +34,8 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 2a24c5476a501..36fa8a3331e7e 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,8 +25,8 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAddAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 570b081aae95e..390dd25b9cf5d 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library( code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph subgraph_detector) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) cc_test( test_code_generator SRCS code_generator_tester.cc diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 9749fb2bfa81c..92c1c1c6f0207 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,7 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h index 
195b29a9794a9..232e9bbf43607 100644 --- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h +++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h @@ -34,7 +34,7 @@ __device__ inline double Log(double x) { return log(x); } __device__ inline double Sqrt(double x) { return sqrt(x); } )"; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static constexpr char predefined_cuda_functions_fp16[] = R"( __device__ inline __half Exp(const __half x) { return hexp(x); } __device__ inline __half Log(const __half x) { return hlog(x); } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 30a001777bd58..17910d7dfae80 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -513,7 +513,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { void ReplaceAllReduceOp(const Node &node, proto::BlockDesc *block, std::vector *ops) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) bool is_fused = (node.Name() == "fused_all_reduce"); details::OpHandleBase &op_handle = @@ -688,7 +688,7 @@ static void GetGraphOpDesc(const std::vector &nodes, ops->emplace_back(); auto &desc = ops->back(); ReplaceScaleLossGradOp(*n, &desc); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) } else if ((n->Name() == "allreduce" || n->Name() == "fused_all_reduce") && dynamic_cast( &(n->Wrapper())) != nullptr) { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 9c60a665de002..c2a8c1bc73e8e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -204,7 +204,7 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 0dcf316c33c69..4579e172ef665 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -30,7 +30,7 @@ class AllReduceDepsPass : public ir::Pass { std::vector all_reduce_op_handles = GetSortedAllReduceOps(*graph); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || 
defined(PADDLE_WITH_MCCL) auto use_hierarchical_allreduce = Get(details::kUseHierarchicalAllReduce); for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index dc18979260f92..a24fd784bb408 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -37,7 +37,7 @@ class FuseAllReduceOpPass : public ir::Pass { auto &places = Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto *multi_nccl_ctxs = &Get(details::kNCCLCtxs); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -95,7 +95,7 @@ class FuseAllReduceOpPass : public ir::Pass { for (auto &p_g : group_p_g) { group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) InsertFusedAllReduce(places, local_scopes, group_size, @@ -177,7 +177,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, const size_t num_of_all_reduce, const std::vector &all_reduce_ops, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -244,7 +244,7 @@ class FuseAllReduceOpPass : public ir::Pass { result->RemoveNode(op_handle.Node()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, @@ -285,7 +285,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, bool is_grad_merge, const std::string &grad_merge_cond_name, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -293,7 +293,7 @@ class FuseAllReduceOpPass : public ir::Pass { ir::Graph *result) const { details::FusedAllReduceOpHandle *op_handle = nullptr; if (is_grad_merge) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) op_handle = new details::FusedGradMergeAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -321,7 +321,7 @@ class FuseAllReduceOpPass : public ir::Pass { grad_merge_cond_name); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -355,7 +355,7 @@ class FuseAllReduceOpPass : public ir::Pass { op_handle->AddOutput(out); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || 
defined(PADDLE_WITH_MCCL) if (!multi_nccl_ctxs) { SetCommunicationContext(places, op_handle); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 295ef57cfdfea..9e7b22b8930cc 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -170,7 +170,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { places_ = Get>(details::kPlaces); local_scopes_ = Get>(details::kLocalScopes); strategy_ = Get(kStrategy); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); nccl_ctxs_ = nullptr; if (multi_nccl_ctxs_) { @@ -338,7 +338,7 @@ std::vector MultiDevSSAGraphBuilderBase::SortOperations( bool MultiDevSSAGraphBuilderBase::UseGPU() const { bool use_gpu = false; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) use_gpu = nccl_ctxs_ != nullptr; #endif return use_gpu; @@ -389,7 +389,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::SetCommunicationContext( details::OpHandleBase *op_handle, const platform::Place &p) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -408,7 +408,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext( void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -453,7 +453,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -534,7 +534,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, -> details::OpHandleBase * { if (is_encoded) { #if defined(PADDLE_WITH_DGC) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) result->Get(kGraphOps).emplace_back( new details::SparseAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -553,7 +553,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name = PADDLE_GET_CONST( std::string, node->Op()->GetAttr(GRAD_MERGE_COND_NAME)); VLOG(10) << "og=" << og << " use grad_merge_allreduce"; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || 
defined(PADDLE_WITH_MCCL) result->Get(kGraphOps).emplace_back( new details::GradMergeAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -578,7 +578,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name)); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -718,7 +718,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( ir::Graph *result, const std::string &og, size_t dst_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 9e8fb5202a2d5..397922ad4bc88 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -39,7 +39,7 @@ class Graph; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class NCCLCommunicator; class NCCLContextMap; #elif defined(PADDLE_WITH_XPU_BKCL) @@ -126,7 +126,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { void CreateIsolatedVarNode(ir::Graph *result, ir::Node *var_node) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index debc3be7a32e0..976cd32e8ae51 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -34,7 +34,7 @@ #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/pir/core/block_argument.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -105,7 +105,7 @@ platform::DeviceContext* ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in @@ -338,7 +338,7 @@ bool GetCondData(const phi::DenseTensor& cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 8383b1fdd1790..a7434ad9d4181 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,7 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = static_cast(std::thread::hardware_concurrency()); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 46b9247728d63..491370d4198fb 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -749,7 +749,7 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 5b60205fbc529..bc273000e626f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -229,7 +229,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index ff5832ba8335e..f6a5ed407c3f3 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,7 @@ PD_DECLARE_bool(benchmark); PHI_DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_pir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -121,7 +121,7 @@ class InterpreterBaseImpl { inline void SetDeviceId(const platform::Place& place) { // TODO(zhiqiu): reduce the cost if (platform::is_gpu_place(place)) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::Unavailable( "Cannot run operator on place %s, please recompile paddle or " "reinstall Paddle with CUDA support.", diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index a336e2c377dfd..ee7587140b923 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -314,7 +314,7 @@ void Instruction::AddInplace(Variable* in, Variable* out) { void Instruction::ClearInplace() { vec_inplace_in_to_out_.clear(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void Instruction::UpdataRecordStreamForGcInfo() { if (!IsInterpretercoreFastGCEnabled() || KernelType() != OpFuncType::kGpuAsync) { @@ -328,7 +328,7 @@ void Instruction::UpdataRecordStreamForGcInfo() { stream_ = reinterpret_cast(DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto operator_base_ptr = OpBase(); if ((operator_base_ptr->Type() == "send_v2") && (operator_base_ptr->Attr("use_calc_stream") == false)) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 66773746deb27..6e96c0e5c109f 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -26,7 +26,7 @@ #include "paddle/fluid/platform/event.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/utils/rw_lock.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -306,7 +306,7 @@ class Instruction { const OpFuncNode* OpFunc() const { return &op_func_node_; } // record stream for gc -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool need_record_stream_for_gc_ = false; gpuStream_t stream_{nullptr}; void UpdataRecordStreamForGcInfo(); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 66de40585130b..fe64b51464214 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -64,7 +64,7 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/dialect/control_flow/ir/cf_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -857,7 +857,7 @@ void PirInterpreter::RecordMemcpyD2H(InstructionBase* instr_node) { } void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -876,7 +876,7 @@ void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { reinterpret_cast(instr->DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
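[Editor's sketch] The send_v2 special case above decides which stream the fast garbage collector records before freeing a tensor. Below is a simplified stand-in for that decision (a hypothetical helper, not the interpreter's actual code), assuming only what the hunks show: async GPU kernels record their compute stream, while send_v2 with use_calc_stream == false does its work on the NCCL/MCCL communication stream, so that stream is the one to record.

#include <iostream>
#include <string>

enum class StreamKind { kNone, kCompute, kComm };

// Chooses the stream to record for stream-safe garbage collection.
StreamKind StreamToRecordForGC(const std::string& op_name,
                               bool is_gpu_async_kernel,
                               bool use_calc_stream) {
  if (!is_gpu_async_kernel) return StreamKind::kNone;  // nothing to record
  if (op_name == "send_v2" && !use_calc_stream) {
    return StreamKind::kComm;  // the communication stream owns the work
  }
  return StreamKind::kCompute;
}

int main() {
  std::cout << (StreamToRecordForGC("send_v2", true, false) == StreamKind::kComm)
            << "\n";  // prints 1
  return 0;
}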
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (instr->Name() == "pd_op.send_v2") { ::pir::Operation* op = instr->Operation(); if (op->HasAttribute("use_calc_stream") && @@ -998,7 +998,7 @@ void PirInterpreter::CheckGC(InstructionBase* instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif @@ -1619,7 +1619,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { if (FLAGS_benchmark) { instr_node->DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << instr_node->Name() // NOLINT << "): context wait and get last error"; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 95eee77d36288..f2fa9fd50eedb 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,7 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index d1ce9f55e4690..f0aefb94e6b69 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/device_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) calculate_stream_timer_ = std::make_unique(place); #endif } @@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) start_time = calculate_stream_timer_->StartTime(); end_time = calculate_stream_timer_->EndTime(); #endif @@ -701,7 +701,7 @@ void ProgramInterpreter::Convert( #endif vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) 
vec_instruction_.back().UpdataRecordStreamForGcInfo(); #endif } @@ -973,7 +973,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { 1, platform::EventRole::kInnerOp); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (is_in_op_profiling_mode_) { platform::GpuDeviceSync(); } @@ -1009,7 +1009,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { OperatorDistAttr* op_dist_attr = block_.Op(op->Id())->MutableDistAttr(); platform::Timer op_timer; op_timer.Start(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::GpuDeviceSync(); #endif op_timer.Pause(); @@ -1040,7 +1040,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() // NOLINT << "): context wait and get last error"; @@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (enable_job_schedule_profiler_) { if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" && !interpreter::IsCommunicationOp(instr_node)) { @@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { } instr_node.RecordEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (enable_job_schedule_profiler_) { if (instr_node.Id() == last_calculate_instr_id_ && calculate_stream_timer_->IsStarted()) { @@ -1320,7 +1320,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { } void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -1428,7 +1428,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (instr.need_record_stream_for_gc_) { RecordStreamForGC(instr); } diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index b19e3a06a4258..701da4f947359 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include 
"paddle/phi/kernels/autotune/gpu_timer.h" #endif @@ -234,7 +234,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector output_hookfuncs_; std::vector input_hookfuncs_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 84ee045918fcd..f4a5f6d410eae 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -359,7 +359,7 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1567,12 +1567,12 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= (dev_ctx.cudnn_handle() != nullptr); } -#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_CUDA) if (use_cudnn && data_type == phi::DataType::BFLOAT16) { @@ -1808,7 +1808,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2071,7 +2071,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) || defined(PADLDE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; @@ -2134,7 +2134,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2157,7 +2157,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. 
expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index d51c0ce0f415d..f8943d53f1590 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -584,7 +584,7 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e6c11df275b56..cef7e14a2a1b8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,14 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +69,7 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::once_flag p2p_init_flag; #endif @@ -148,7 +148,7 @@ class ParallelExecutorPrivate { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ << ", num_trainers:" << bst.num_trainers_ @@ -162,7 +162,7 @@ class ParallelExecutorPrivate { << bst.hierarchical_allreduce_exter_nranks_; } - std::vector flat_nccl_ids; + std::vector flat_nccl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create ncclid when nranks==1 nccl_ctxs_->InitFlatCtxs( @@ -173,18 +173,18 @@ class ParallelExecutorPrivate { if (bst.enable_parallel_graph_) { VLOG(1) << "use only one ncclid in pg model"; - ncclUniqueId *nccl_id = nullptr; + mcclUniqueId *nccl_id = nullptr; std::string var_name = platform::GetFlatNCCLVarName(0); auto nccl_id_var = scope->FindVar(var_name); if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id = nccl_id_var->GetMutable(); VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; } else { - nccl_id = new ncclUniqueId(); + nccl_id = new mcclUniqueId(); PADDLE_ENFORCE_EQ( - platform::dynload::ncclGetUniqueId(nccl_id), - ncclSuccess, + platform::dynload::mcclGetUniqueId(nccl_id), + mcclSuccess, platform::errors::PreconditionNotMet( "PaddlePaddle failed to get NCCL unique ID. 
It may due to your " "system settings or NCCL library error, please debug on NCCL")); @@ -213,7 +213,7 @@ class ParallelExecutorPrivate { PADDLE_ENFORCE_NOT_NULL( nccl_id_var, platform::errors::NotFound("Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); flat_nccl_ids.push_back(nccl_id); } @@ -221,25 +221,25 @@ class ParallelExecutorPrivate { places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; + std::vector inter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto inter_nccl_id = nccl_id_var->GetMutable(); + auto inter_nccl_id = nccl_id_var->GetMutable(); inter_nccl_ids.push_back(inter_nccl_id); } - std::vector exter_nccl_ids; + std::vector exter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); exter_nccl_ids.push_back(nccl_id); } @@ -400,7 +400,7 @@ class ParallelExecutorPrivate { std::unordered_map is_persistable_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::NCCLCommunicator *nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; @@ -512,7 +512,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); @@ -623,7 +623,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -644,6 +644,10 @@ void InitP2P(const std::vector &places) { hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = + musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -655,6 +659,8 @@ void InitP2P(const std::vector &places) { platform::CUDADeviceGuard guard(devices[i]); #ifdef PADDLE_WITH_HIP hipDeviceEnablePeerAccess(devices[j], 0); +#elif defined(PADDLE_WITH_MUSA) + musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -807,12 +813,12 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - ncclDataType_t data_type = platform::ToNCCLDataType(dtype); + mcclDataType_t data_type = platform::ToNCCLDataType(dtype); for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; @@ -840,7 +846,7 @@ void ParallelExecutor::BCastParamsToDevices( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], + platform::dynload::mcclBcast(buffers[i], numel, data_type, 0, @@ -1282,7 +1288,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1291,8 +1297,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1450,7 +1456,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { } if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); // Initialize device context's nccl comm, will be used by normal @@ -1501,7 +1507,7 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( std::vector async_graphs(device_count); auto &graphs = *device_graphs; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (member_->build_strategy_.async_mode_) { PADDLE_ENFORCE_EQ(graphs.size(), device_count, @@ -1656,7 +1662,7 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 32514089763c6..48cd609d798e3 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index cc5cf54724dab..0b77e80a0b465 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,7 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index d1eb5558c5454..e37957918fe40 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 4566927e068ca..827e39c152640 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +34,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)|| (defined PADDLE_WITH_MCCL) place_ = platform::CUDAPlace(place_id); #endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 4b629c24cf0e6..472eb5ef9b42f 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL ||defined PADDLE_WITH_MCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 85fc30978f16a..f1cc62bbfd304 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/lodtensor_printer.h" #include "paddle/fluid/string/string_helper.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA @@ -286,7 +286,7 @@ void PSGPUWorker::TrainFiles() { timeline.Start(); int total_ins_num = 0; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -511,7 +511,7 @@ void PSGPUWorker::TrainFilesWithProfiler() { int total_ins_num = 0; int cur_batch; timeline.Start(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index f295fa7106dd4..8b740ea6156e2 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,11 +69,11 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) copy_streams_.clear(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) places_.clear(); thread_scopes_.clear(); #endif @@ -81,7 +81,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { void PullDenseWorker::CreatePinVar() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -95,7 +95,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -125,7 +125,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { } status_vec->resize(0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -141,7 +141,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -177,7 +177,7 @@ void PullDenseWorker::PullDense(bool force_update) { dwp_param_.program_config(0).pull_dense_table_id(i)); 
if (force_update || CheckUpdateParam(tid)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index f88dbc409d170..9f347ca4c0126 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include #include "paddle/fluid/framework/device_worker.h" @@ -228,7 +228,7 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place_, diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 27dc5902c75ba..01267fd059c1f 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -125,7 +125,7 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -379,7 +379,7 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -482,7 +482,7 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -616,7 +616,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(shape)); @@ -690,7 +690,7 @@ void TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || 
platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(dims)); framework::VisitDataType( @@ -812,7 +812,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -852,7 +852,7 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index d9e3e38433736..c4d9b9c143009 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,7 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +175,7 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +304,7 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +346,7 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index af7fc63a2122a..75268cb5aea27 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -159,7 +159,7 @@ class DistMultiTrainer : public MultiTrainer { std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA || defined 
PADDLE_WITH_HIP || \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA)|| \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) class HeterServiceContext { @@ -175,7 +175,7 @@ class HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; #endif std::vector ops_; @@ -207,7 +207,7 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -245,7 +245,7 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; std::vector events_; #endif @@ -253,7 +253,7 @@ class HeterXpuTrainer : public TrainerBase { #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUTrainer : public TrainerBase { @@ -305,7 +305,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index ba5dac4830aa1..aeb033649509f 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -72,17 +72,17 @@ REGISTER_TRAINER_CLASS(DistMultiTrainer); REGISTER_TRAINER_CLASS(HeterPipelineTrainer); #endif -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA || \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index c1f192673a702..42471cceb3025 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,6 +37,13 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#if defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT 
+#endif +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 9bffd125a3f3d..61790dc36e912 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -34,6 +34,14 @@ #include #endif #endif + +#ifdef PADDLE_WITH_MUSA +#include +#if defined(PADDLE_WITH_MCCL) +#include +#endif +#endif + #ifdef PADDLE_WITH_HIP #include #ifdef PADDLE_WITH_RCCL @@ -60,8 +68,8 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class Communicator; class NCCLCommunicator; #endif @@ -190,13 +198,13 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclUniqueId, +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + mcclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif - operators::CudnnRNNCache, + // operators::CudnnRNNCache, #endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index b6d846e9a0c12..ebf1fd4141ace 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -97,7 +97,7 @@ cc_library( SRCS profiler.cc DEPS phi common) if(NOT WIN32) - if(WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) cc_library( imperative_all_reduce SRCS all_reduce.cc @@ -119,6 +119,12 @@ if(NOT WIN32) SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) endif() + if(WITH_MCCL) + musa_library( + reducer + SRCS reducer.cc reducer.cu + DEPS layer imperative_all_reduce) + endif() endif() if(WITH_XPU_BKCL) cc_library( @@ -138,6 +144,7 @@ if(NOT WIN32) if(NOT (WITH_NCCL OR WITH_RCCL + OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_GLOO)) cc_library( @@ -148,6 +155,7 @@ if(NOT WIN32) endif() if(WITH_NCCL OR WITH_RCCL + OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_CUSTOM_DEVICE) cc_library( @@ -169,6 +177,7 @@ if(WITH_GLOO) OR (NOT (WITH_NCCL OR WITH_RCCL + OR WITH_MCCL OR WITH_XPU_BKCL) )) cc_library( diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index c4bb42e4c22bb..5436364e56f7f 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
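[Editor's sketch] The var_type_traits changes above grow the registry of variable payload types with whichever collective backend is compiled in. As a loose, self-contained analogy only (stub types and std::variant, not Paddle's registry machinery), the set of registered alternatives can be made to depend on the NCCL/RCCL/MCCL build flags like this:

#include <cstdio>
#include <variant>

struct DenseTensorStub {};          // always registered
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
struct CollectiveUniqueIdStub {};   // stands in for ncclUniqueId / mcclUniqueId
using VarPayload = std::variant<DenseTensorStub, CollectiveUniqueIdStub>;
#else
using VarPayload = std::variant<DenseTensorStub>;
#endif

int main() {
  std::printf("registered payload alternatives: %zu\n",
              std::variant_size_v<VarPayload>);
  return 0;
}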
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/imperative/all_reduce.h" @@ -26,6 +26,11 @@ #include #endif +#ifdef PADDLE_WITH_MCCL +#include +#endif + + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" @@ -69,16 +74,16 @@ static void AllReduce(const phi::DenseTensor &src, auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); auto nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(src.dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(src_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(src_ptr, dst_ptr, src.numel(), nccl_dtype, - ncclSum, + mcclSum, comm->comm(), stream)); } -#if NCCL_VERSION_CODE >= 2212 +// #if NCCL_VERSION_CODE >= 2212 static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, const ParallelStrategy &strategy, @@ -101,7 +106,7 @@ static void AllReduce(const phi::SelectedRows &src, bool use_calc_stream = (dev_ctx->stream() == stream); VLOG(4) << "Is use calculate stream: " << use_calc_stream; - // 1. Gather rows number from all workers. Here use ncclAllGather to do this, + // 1. Gather rows number from all workers. Here use mcclAllGather to do this, // but we can use other ways to implement is in the future const auto &src_rows = src.rows(); phi::Vector rows_num_vector(strategy.nranks_); @@ -114,10 +119,10 @@ static void AllReduce(const phi::SelectedRows &src, dev_ctx->Wait(); } PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(gpu_rows_num_ptr + strategy.local_rank_, + platform::dynload::mcclAllGather(gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, - ncclInt64, + mcclInt64, comm->comm(), stream)); @@ -163,14 +168,14 @@ static void AllReduce(const phi::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_rows_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_rows_ptr, dst_rows_ptr, row_sendcount, - ncclInt64, + mcclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_tensor_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, @@ -181,10 +186,10 @@ static void AllReduce(const phi::SelectedRows &src, if (cpu_rows_num_ptr[i] > 0) { // 2. 
Broadcast the rows of SelectedRows PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclBroadcast(src_rows_ptr, + platform::dynload::mcclBroadcast(src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - ncclInt64, + mcclInt64, i, comm->comm(), stream)); @@ -192,7 +197,7 @@ static void AllReduce(const phi::SelectedRows &src, auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclBroadcast(src_tensor_ptr, + platform::dynload::mcclBroadcast(src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, @@ -212,7 +217,7 @@ static void AllReduce(const phi::SelectedRows &src, VLOG(3) << "Result SelectedRows rows: " << string::join_strings(*dst_rows, ','); } -#endif +// #endif void AllReduce(const framework::Variable &src, framework::Variable *dst, @@ -234,7 +239,7 @@ void AllReduce(const framework::Variable &src, dst->GetMutable(), stream, comm); -#if NCCL_VERSION_CODE >= 2212 +// #if NCCL_VERSION_CODE >= 2212 } else if (src.IsType()) { if (&src != dst) { if (!dst->IsType()) { @@ -257,7 +262,7 @@ void AllReduce(const framework::Variable &src, platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } -#endif +// #endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported variable type %s for imperative allreduce, only " diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h index 49e3054924205..049345772de65 100644 --- a/paddle/fluid/imperative/all_reduce.h +++ b/paddle/fluid/imperative/all_reduce.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 0c16a95035870..dfb231ead927e 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -129,7 +129,7 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 4e0df45e840f2..58ecec47cccf3 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -141,7 +141,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, const auto &src_tensor = src.value(); const auto &place = src_tensor.place(); auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); - // 1. Gather rows number from all workers. Here use ncclAllGather to do this, + // 1. Gather rows number from all workers. 
Here use mcclAllGather to do this, // but we can use other ways to implement is in the future auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 267540f080741..61bb0a1d7c14e 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -209,7 +209,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -326,7 +326,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -334,7 +334,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -381,7 +381,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -389,7 +389,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -447,7 +447,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -463,7 +463,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -734,7 +734,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -800,7 +800,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index d70d40808f915..13a3d356e61c5 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/imperative/nccl_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -41,10 +41,10 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void NCCLParallelContext::BcastNCCLId( - std::vector &nccl_ids, // NOLINT + std::vector &nccl_ids, // NOLINT int root, int server_fd) { if (strategy_.local_rank_ == root) { @@ -64,13 +64,13 @@ void NCCLParallelContext::BcastNCCLId( void NCCLParallelContext::Init() { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker for (auto &nccl_id : nccl_ids) { - platform::dynload::ncclGetUniqueId(&nccl_id); + platform::dynload::mcclGetUniqueId(&nccl_id); } } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server @@ -101,12 +101,12 @@ void NCCLParallelContext::Init() { void NCCLParallelContext::InitWithRingID(int ring_id) { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker - platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + platform::dynload::mcclGetUniqueId(&nccl_ids[0]); } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server // on rank0. 
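[Editor's sketch] The Init/InitWithRingID hunks above keep the existing handshake and only swap in mcclUniqueId/mcclGetUniqueId: rank 0 generates one unique id per communication ring and every other rank receives it before communicators are built. A stub sketch of that protocol follows (no real MCCL or socket calls; the types and helpers are hypothetical):

#include <cstdio>
#include <vector>

struct UniqueIdStub { int value = 0; };  // stands in for mcclUniqueId

UniqueIdStub GenerateUniqueId() {        // stands in for mcclGetUniqueId
  static int next = 1;
  return UniqueIdStub{next++};
}

std::vector<UniqueIdStub> InitRingIds(int local_rank, int nrings) {
  std::vector<UniqueIdStub> ids(nrings);
  if (local_rank == 0) {
    for (auto& id : ids) id = GenerateUniqueId();  // root generates the ids
    // ... then broadcasts them to the other ranks (Paddle uses a socket/TCP exchange)
  } else {
    // ... non-root ranks receive the ids published by rank 0 instead
  }
  return ids;
}

int main() {
  auto ids = InitRingIds(/*local_rank=*/0, /*nrings=*/2);
  std::printf("rings prepared on rank 0: %zu\n", ids.size());
  return 0;
}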
@@ -152,7 +152,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType( framework::TransToProtoVarType(src_tensor->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); } @@ -188,6 +188,9 @@ void NCCLParallelContext::WaitCompute(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); @@ -218,6 +221,9 @@ void NCCLParallelContext::WaitComm(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 7db96b2ee3d48..f71c57af3f4f6 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -17,7 +17,7 @@ #include #include -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -29,6 +29,10 @@ #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif + #include "paddle/fluid/imperative/parallel_context.h" namespace paddle { @@ -40,7 +44,7 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class NCCLParallelContext : public ParallelContext { public: explicit NCCLParallelContext(const ParallelStrategy& strategy, @@ -49,7 +53,7 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index d336488a42327..1545eb0bd6e68 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,7 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +555,7 @@ static void PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +645,7 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 4bbc52662fc96..ef63b4a1b62d3 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -29,7 +29,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) // div the nranks @@ -40,7 +40,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) DivNRanks(tensor, nranks, context); #endif } else if (platform::is_cpu_place(tensor->place())) { @@ -228,7 +228,7 @@ void SplitTensorsWithType( void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) ConcatTensorsWithType(static_cast(context), dense_tensors_, &dense_contents_, @@ -264,7 +264,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) SplitTensorsWithType(static_cast(context), &dense_contents_, &dense_tensors_, @@ -1020,7 +1020,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_GLOO) ProcessUnusedDenseVars(); #endif diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 59b7ecf915423..5d89f487bc379 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -17,7 +17,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void Group::DivNRanks(phi::DenseTensor *tensor, int64_t nranks, const platform::DeviceContext &context) { diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 011c8871329a5..9a6e1de71fe9d 100644 --- a/paddle/fluid/imperative/reducer.h +++ 
b/paddle/fluid/imperative/reducer.h @@ -44,7 +44,7 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0f992c9b8be30..d01fefc779594 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -137,7 +137,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -147,7 +147,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -309,7 +309,7 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index d2f834a5938e9..3f4e7a9344a30 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) -set(utils_modules pretty_log string_helper utf8proc) +set(utils_modules pretty_log string_helper benchmark utf8proc) if(NOT WITH_GFLAGS) set(utils_modules ${utils_modules} paddle_flags) diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 221e6b7de1abf..302bc160c9938 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -38,7 +38,7 @@ namespace paddle { namespace inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
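Most hunks in these files make the same mechanical change: blocks that were compiled only for CUDA or ROCm are now also compiled for MUSA builds. A short sketch of the pattern, using the benchmark-mode error check from prepared_operator.cc as it appears in this diff:

  if (FLAGS_benchmark) {
    dev_ctx->Wait();  // drain the device stream before timing
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
    // Only GPU builds (CUDA, ROCm, and now MUSA) can query the last device error.
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
#endif
  }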
if (!argument->use_gpu()) return; @@ -215,7 +215,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index ee29af1c13308..6ab7d83b8922d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 94e71f1cfddf1..d9d7d5aa3659a 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +100,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -180,11 +180,6 @@ void AnalysisConfig::EnableXpu(int l3_size, bool transformer_encoder_adaptive_seqlen, bool enable_multi_stream) { #if defined(PADDLE_WITH_XPU) || defined(LITE_SUBGRAPH_WITH_XPU) - LOG_FIRST_N(WARNING, 1) - << "Parameters in EnableXpu/enable_xpu is deprecated since version " - "2.6.1, and will be removed in version 3.0! 
Please use " - "EnableXpu/enable_xpu without parameters, and use " - "SetXpuConfig/set_xpu_config to set options."; use_xpu_ = true; xpu_config_.l3_size = l3_size; xpu_config_.conv_autotune_level = conv_autotune; @@ -641,7 +636,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -996,7 +991,7 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1212,7 +1207,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 476c78638c47f..b8d95d712bdd8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -120,7 +120,7 @@ PHI_DECLARE_bool(pir_apply_inplace_pass); namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -152,7 +152,7 @@ void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, gpu_context->SetBlasTF32Handle( gpu_resource->GetBlasTF32TensorCoreHandleCreator()); gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator()); - gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); + // gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator()); gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); @@ -292,7 +292,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -424,7 +424,7 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(inference): Now only gpu with external stream support private // device_context. 
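For context, the guarded entry point above is the one user code already calls; a minimal usage sketch is below. The model paths are placeholders and the precision argument is left at its default; with a PADDLE_WITH_MUSA build the call now takes the GPU branch of the #if shown above.

  paddle::AnalysisConfig config;
  config.SetModel("./model.pdmodel", "./model.pdiparams");   // placeholder paths
  // 100 MB initial memory pool on device 0.
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);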
if (config_.use_gpu_ && config_.use_external_stream_) { @@ -472,7 +472,7 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -543,14 +543,14 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Init GPUContext. if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -598,7 +598,7 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2315,7 +2315,7 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2326,6 +2326,8 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2365,11 +2367,13 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2764,7 +2768,7 @@ AnalysisPredictor::~AnalysisPredictor() { // NOLINT if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } @@ -3330,6 +3334,15 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, return false; } +bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, + musaStream_t stream) { +#ifdef PADDLE_WITH_MUSA + auto pred 
= dynamic_cast(p->predictor_.get()); + return pred->ExpRunWithExternalStream(stream); +#endif + return false; +} + bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, void *config) { auto pred = dynamic_cast(p->predictor_.get()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 4a5cfb229a459..6725915a2c00c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -208,7 +208,7 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Note: Can only be used under thread_local semantics. bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index d886885edb5ba..3c26f329d4747 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index eee3a707a03b1..530bc6f8a3eda 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -110,7 +110,7 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); +#elif defined(PADDLE_WITH_MUSA) + // async, return stream + if (nullptr != exec_stream) { + *(static_cast(exec_stream)) = dev_ctx->stream(); + // async with callback + } else if (cb) { + musaLaunchHostFunc(dev_ctx->stream(), cb, cb_params); + // sync + } else { + musaStreamSynchronize(dev_ctx->stream()); + } #else // async, return stream if (nullptr != exec_stream) { @@ -857,7 +868,7 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); 
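The new musaStream_t overload mirrors the existing cudaStream_t and hipStream_t entry points. A usage sketch follows; the experimental InternalUtils namespace is assumed to match upstream Paddle, and the stream is whatever MUSA stream the caller already owns.

bool RunOnExternalMusaStream(paddle_infer::Predictor* predictor,
                             musaStream_t stream) {
  // Forwards to AnalysisPredictor::ExpRunWithExternalStream(stream) when the
  // library is built with PADDLE_WITH_MUSA; otherwise the overload returns false.
  return paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor, stream);
}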
paddle::memory::Copy(gpu_place, @@ -927,7 +938,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index 7879adb57d86e..d0bad85bfdee1 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -22,7 +22,7 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 216c7747f0706..518a85119ed79 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,7 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); @@ -35,7 +35,7 @@ class InferGPUContext : public phi::GPUContext { using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; using phi::GPUContext::SetEigenDevice; - using phi::GPUContext::SetSolverHandle; + // using phi::GPUContext::SetSolverHandle; using phi::GPUContext::SetSparseHandle; using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b5a26ff9225aa..6a3e943dec7e9 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -111,7 +111,6 @@ struct PD_INFER_DECL XpuConfig { bool conv_autotune_file_writeback{false}; // Fc autotune level. The Optional values are 0-9. Default 0 means no - // autotune. int fc_autotune_level{0}; // Base fc autotune info is read from fc_autotune_file. 
std::string fc_autotune_file; @@ -368,7 +367,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableXpu(int l3_size = 0xfffc00, bool l3_locked = false, - bool conv_autotune = false, + bool conv_autotune = true, const std::string& conv_autotune_file = "", const std::string& transformer_encoder_precision = "int16", bool transformer_encoder_adaptive_seqlen = false, diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 3fefba9ef22be..10e6d38e5a900 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -470,6 +470,7 @@ PD_INFER_DECL std::shared_ptr MakeCipher( // forward declation using cudaStream_t = struct CUstream_st*; using hipStream_t = struct ihipStream_t*; +using musaStream_t = struct MUstream_st*; namespace paddle_infer { class Predictor; @@ -507,6 +508,8 @@ class PD_INFER_DECL InternalUtils { cudaStream_t stream); static bool RunWithExternalStream(paddle_infer::Predictor* pred, hipStream_t stream); + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + musaStream_t stream); static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config); static void UpdateConfigInterleaved(paddle_infer::Config* c, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 4af87b029fd22..9aaa2184875dc 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -16,7 +16,10 @@ #ifdef PADDLE_WITH_CUDA #include #endif -#ifdef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_MUSA +#include +#endif +#ifdef PADDLE_WITH_HIP #include #endif #ifdef PADDLE_WITH_TENSORRT diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 2a8029555e94f..96676ff818c56 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,7 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -102,6 +102,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -132,7 +135,7 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -158,6 +161,8 @@ void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -166,8 +171,8 @@ void GPUContextResource::DestroyGPUResource() { DestroyDnnHandle(); 
DestroyBlasHandle(); - DestroyBlasLtHandle(); - DestroySolverHandle(); + // DestroyBlasLtHandle(); + // DestroySolverHandle(); DestroySparseHandle(); } @@ -205,21 +210,21 @@ void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } -void GPUContextResource::InitBlasLtHandle() { - phi::InitBlasLtHandle(&blaslt_handle_); -} +// void GPUContextResource::InitBlasLtHandle() { +// phi::InitBlasLtHandle(&blaslt_handle_); +// } -void GPUContextResource::DestroyBlasLtHandle() { - phi::DestroyBlasLtHandle(blaslt_handle_); -} +// void GPUContextResource::DestroyBlasLtHandle() { +// phi::DestroyBlasLtHandle(blaslt_handle_); +// } -void GPUContextResource::InitSolverHandle() { - phi::InitSolverHandle(&solver_handle_, stream_); -} +// void GPUContextResource::InitSolverHandle() { +// phi::InitSolverHandle(&solver_handle_, stream_); +// } -void GPUContextResource::DestroySolverHandle() { - phi::DestroySolverHandle(solver_handle_); -} +// void GPUContextResource::DestroySolverHandle() { +// phi::DestroySolverHandle(solver_handle_); +// } void GPUContextResource::InitSparseHandle() { phi::InitSparseHandle(&sparse_handle_, stream_); @@ -287,29 +292,29 @@ GPUContextResource::GetBlasTF32TensorCoreHandleCreator() { }; } -blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { - return blaslt_handle_; -} +// blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { +// return blaslt_handle_; +// } -std::function -GPUContextResource::GetBlasLtHandleCreator() { - return [&]() { - InitBlasLtHandle(); - return blaslt_handle_; - }; -} +// std::function +// GPUContextResource::GetBlasLtHandleCreator() { +// return [&]() { +// InitBlasLtHandle(); +// return blaslt_handle_; +// }; +// } -phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { - return solver_handle_; -} +// phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { +// return solver_handle_; +// } -std::function -GPUContextResource::GetSolverDnHandleCreator() { - return [&]() { - InitSolverHandle(); - return solver_handle_; - }; -} +// std::function +// GPUContextResource::GetSolverDnHandleCreator() { +// return [&]() { +// InitSolverHandle(); +// return solver_handle_; +// }; +// } phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; @@ -380,7 +385,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 1f4d4ea420e1b..96d534e8cc954 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -26,7 +26,7 @@ #include "paddle/utils/test_macros.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -50,7 +50,7 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -61,8 +61,8 @@ class GPUContextResource { std::function GetBlasHandleCreator(); std::function GetBlasTensorCoreHandleCreator(); std::function GetBlasTF32TensorCoreHandleCreator(); - std::function GetBlasLtHandleCreator(); - std::function GetSolverDnHandleCreator(); + // std::function GetBlasLtHandleCreator(); + // std::function GetSolverDnHandleCreator(); std::function GetSparseHandleCreator(); std::function GetGpuEigenDeviceCreator(); @@ -71,8 +71,8 @@ class GPUContextResource { blasHandle_t GetBlasHandle() const; blasHandle_t GetBlasTensorCoreHandle() const; blasHandle_t GetBlasTF32Handle() const; - blasLtHandle_t GetBlasLtHandle() const; - phi::solverHandle_t GetSolverDnHandle() const; + // blasLtHandle_t GetBlasLtHandle() const; + // phi::solverHandle_t GetSolverDnHandle() const; phi::sparseHandle_t GetSparseHandle() const; Eigen::GpuDevice* GetGpuEigenDevice() const; int GetGpuComputeCapability() const; @@ -91,10 +91,10 @@ class GPUContextResource { void InitDnnHanlde(); void DestroyDnnHandle(); void DestroyBlasHandle(); - void InitBlasLtHandle(); - void DestroyBlasLtHandle(); - void InitSolverHandle(); - void DestroySolverHandle(); + // void InitBlasLtHandle(); + // void DestroyBlasLtHandle(); + // void InitSolverHandle(); + // void DestroySolverHandle(); void InitSparseHandle(); void DestroySparseHandle(); @@ -117,9 +117,9 @@ class GPUContextResource { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; - blasLtHandle_t blaslt_handle_{nullptr}; + // blasLtHandle_t blaslt_handle_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; - phi::solverHandle_t solver_handle_{nullptr}; + // phi::solverHandle_t solver_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; // DnnWorkspaceHandle }; @@ -139,7 +139,7 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 9b36b6dc745e8..f3c953fb60a97 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc old mode 100644 new mode 100755 index 10763eb911543..8cf589541b1e0 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -47,7 +47,6 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); - int8_teller_set.insert("tile"); 
teller_set.insert("flatten_contiguous_range"); int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); @@ -2303,20 +2302,15 @@ struct SimpleOpTypeSetTeller : public Teller { if (!with_dynamic_shape) { if (tile_inputs.find("repeat_times_tensor") != tile_inputs.end()) { if (!desc.Input("repeat_times_tensor").empty()) { - VLOG(3) << "Tile op: repeat_times_tensor is not empty."; return false; } } if (tile_inputs.find("RepeatTimes") != tile_inputs.end()) { if (!desc.Input("RepeatTimes").empty()) { - VLOG(3) << "Tile op: RepeatTimes is not empty."; return false; } } - if (!desc.HasAttr("repeat_times")) { - VLOG(3) << "Tile op:`repeat_times` is not set."; - return false; - } + if (!desc.HasAttr("repeat_times")) return false; } } #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index b3b0cd35fb300..6da8e874adc81 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -19,7 +19,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -30,13 +30,13 @@ namespace inference { namespace tensorrt { namespace plugin { #if defined(PADDLE_WITH_NCCL) -inline ncclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { +inline mcclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { if (type == nvinfer1::DataType::kFLOAT) { - return ncclFloat; + return mcclFloat; } else if (type == nvinfer1::DataType::kHALF) { - return ncclFloat16; + return mcclFloat16; } else if (type == nvinfer1::DataType::kINT8) { - return ncclInt8; + return mcclInt8; } else if (type == nvinfer1::DataType::kINT32) { return ncclInt32; } else { @@ -159,23 +159,23 @@ int CAllReducePluginDynamic::enqueue( auto input_type = input_desc[0].type; void* sendbuff = const_cast(inputs[0]); void* recvbuff = outputs[0]; - ncclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); - ncclRedOp_t nccl_red_type = ncclSum; + mcclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); + mcclRedOp_t nccl_red_type = mcclSum; switch (red_type_) { case kRedSum: - nccl_red_type = ncclSum; + nccl_red_type = mcclSum; break; case kRedMax: - nccl_red_type = ncclMax; + nccl_red_type = mcclMax; break; case kRedMin: - nccl_red_type = ncclMin; + nccl_red_type = mcclMin; break; case kRedProd: - nccl_red_type = ncclProd; + nccl_red_type = mcclProd; break; default: @@ -202,9 +202,9 @@ int CAllReducePluginDynamic::enqueue( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); auto stream = comm_ctx->GetStream(); - ncclRedOp_t nccl_red_type = ncclSum; + mcclRedOp_t nccl_red_type = mcclSum; // comm_ctx->AllReduce(&inputs[0], inputs[0], nccl_red_type, stream); - phi::dynload::ncclAllReduce(sendbuff, + phi::dynload::mcclAllReduce(sendbuff, recvbuff, numel, dtype, @@ -215,7 +215,7 @@ int CAllReducePluginDynamic::enqueue( } else { auto comm = platform::NCCLCommContext::Instance().Get(ring_id_); cudaStream_t custream = use_calc_stream_ ? 
stream : comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 298f54de48e8f..fec0a927b20e8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -218,6 +218,9 @@ void QkvToContextPluginDynamic::configurePlugin( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 0ad2cb0e3f0c8..3dbc06bfc11b7 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,3 +1,8 @@ +cc_library( + benchmark + SRCS benchmark.cc + DEPS enforce common) +paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -8,5 +13,13 @@ cc_library( DEPS proto_desc enforce common) cc_library(table_printer SRCS table_printer.cc) +paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_benchmark) + copy_onnx(test_table_printer) +endif() diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc new file mode 100644 index 0000000000000..24bc99ed183fa --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/utils/benchmark.h" + +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +std::string Benchmark::SerializeToString() const { + std::stringstream ss; + ss << "-----------------------------------------------------\n"; + ss << "name\t"; + ss << "batch_size\t"; + ss << "num_threads\t"; + ss << "latency\t"; + ss << "qps"; + ss << '\n'; + + ss << name_ << "\t"; + ss << batch_size_ << "\t\t"; + ss << num_threads_ << "\t"; + ss << latency_ << "\t"; + ss << 1000.0 / latency_; + ss << '\n'; + return ss.str(); +} +void Benchmark::PersistToFile(const std::string &path) const { + std::ofstream file(path, std::ios::app); + PADDLE_ENFORCE_EQ( + file.is_open(), + true, + platform::errors::Unavailable("Can not open %s to add benchmark.", path)); + file << SerializeToString(); + file.flush(); + file.close(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h new file mode 100644 index 0000000000000..56789843c3728 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/utils/test_macros.h" + +namespace paddle { +namespace inference { + +/* + * Helper class to calculate the performance. + */ +struct TEST_API Benchmark { + int batch_size() const { return batch_size_; } + void SetBatchSize(int x) { batch_size_ = x; } + + int num_threads() const { return num_threads_; } + void SetNumThreads(int x) { num_threads_ = x; } + + bool use_gpu() const { return use_gpu_; } + void SetUseGpu() { use_gpu_ = true; } + + float latency() const { return latency_; } + void SetLatency(float x) { latency_ = x; } + + const std::string& name() const { return name_; } + void SetName(const std::string& name) { name_ = name; } + + std::string SerializeToString() const; + void PersistToFile(const std::string& path) const; + + private: + bool use_gpu_{false}; + int batch_size_{0}; + float latency_; + int num_threads_{1}; + std::string name_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc new file mode 100644 index 0000000000000..8f7614cb10a44 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/inference/utils/benchmark.h" + +using namespace paddle::inference; // NOLINT +TEST(Benchmark, basic) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); +} + +TEST(Benchmark, PersistToFile) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + + benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("2.log"); + benchmark.PersistToFile("3.log"); +} diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc new file mode 100644 index 0000000000000..fc482807b2854 --- /dev/null +++ b/paddle/fluid/inference/utils/table_printer_tester.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
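For reference, each PersistToFile call in the benchmark tester above appends one record in the shape produced by SerializeToString(), roughly as follows (tab separated; qps is computed as 1000 / latency):

// -----------------------------------------------------
// name   batch_size   num_threads   latency   qps
// key0   10           1             220       4.54545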
+ +#include +#include + +#include "paddle/fluid/inference/utils/table_printer.h" + +namespace paddle { +namespace inference {} // namespace inference +} // namespace paddle + +TEST(table_printer, output) { + std::vector header{"config", "value"}; + paddle::inference::TablePrinter table(header); + + // model_dir + table.InsertRow({"model_dir", "./model_dir"}); + // model + table.InsertRow({"model_file", "./model.pdmodel"}); + table.InsertRow({"params_file", "./model.pdiparams"}); + + table.InsetDivider(); + // gpu + table.InsertRow({"use_gpu", "true"}); + table.InsertRow({"gpu_device_id", "0"}); + table.InsertRow({"memory_pool_init_size", "100MB"}); + table.InsertRow({"thread_local_stream", "false"}); + table.InsetDivider(); + + // trt precision + table.InsertRow({"use_trt", "true"}); + table.InsertRow({"trt_precision", "fp32"}); + table.InsertRow({"enable_dynamic_shape", "true"}); + table.InsertRow({"DisableTensorRtOPs", "{}"}); + table.InsertRow({"EnableVarseqlen", "ON"}); + table.InsertRow({"tensorrt_dla_enabled", "ON"}); + table.InsetDivider(); + + // lite + table.InsertRow({"use_lite", "ON"}); + table.InsetDivider(); + + // xpu + table.InsertRow({"use_xpu", "true"}); + table.InsertRow({"xpu_device_id", "0"}); + table.InsetDivider(); + + // ir + table.InsertRow({"ir_optim", "true"}); + table.InsertRow({"ir_debug", "false"}); + table.InsertRow({"enable_memory_optim", "false"}); + table.InsertRow({"EnableProfile", "false"}); + table.InsertRow({"glog_info_disabled", "false"}); + table.InsetDivider(); + + // cpu + table.InsertRow({"CpuMathLibrary", "4"}); + // mkldnn + table.InsertRow({"enable_mkldnn", "false"}); + table.InsertRow({"mkldnn_cache_capacity", "10"}); + + // a long string + table.InsertRow( + {"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ a long string " + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", + "------------------------------------------ a long value " + "-----------------------------------------------------"}); + + LOG(INFO) << table.PrintTable(); +} diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 5b49d927ae676..aed5d674e49ff 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -62,6 +62,17 @@ if(WITH_ROCM) DEPS malloc gpu_info place) endif() +if(WITH_MUSA) + musa_test( + malloc_test + SRCS malloc_test.cu + DEPS device_context malloc) + musa_test( + cuda_managed_memory_test + SRCS cuda_managed_memory_test.cu + DEPS malloc gpu_info place) +endif() + if(WITH_TESTING AND TEST cuda_managed_memory_test) set_tests_properties( cuda_managed_memory_test diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index ffce57d78f164..eae17991ff2fe 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,7 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list( APPEND ALLOCATOR_SRCS @@ -90,6 +90,13 @@ if(WITH_ROCM) SRCS thread_local_allocator_test.cc DEPS allocator) endif() +if(WITH_MUSA) + musa_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS allocator) +endif() + if(WITH_GPU) nv_test( @@ -101,6 +108,11 @@ elseif(WITH_ROCM) best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) +elseif(WITH_MUSA) + musa_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS allocator memcpy) else() 
cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS allocator) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index dd86ba9855fba..17839ecf0caec 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -26,9 +26,9 @@ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" -#ifdef PADDLE_WITH_NCCL -#include -#include "paddle/fluid/platform/dynload/nccl.h" +#ifdef PADDLE_WITH_MCCL +#include +#include "paddle/fluid/platform/dynload/mccl.h" #endif PHI_DECLARE_string(allocator_strategy); @@ -144,22 +144,22 @@ using DecoratedAllocationPtr = template static T&& FillValue(T&& allocation) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_MUSA) if (allocation != nullptr) { if (FLAGS_sync_after_alloc || FLAGS_alloc_fill_value >= 0) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); if (FLAGS_alloc_fill_value >= 0) { VLOG(10) << "Set " << FLAGS_alloc_fill_value << " on " << allocation->ptr() << " " << allocation->place() << " " << allocation->size(); if (platform::is_gpu_place(allocation->place())) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset( + PADDLE_ENFORCE_GPU_SUCCESS(musaMemset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size())); } else { std::memset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); } - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } } } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 59ab4eaf15472..e7df0f7213363 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -165,7 +165,7 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAAllocatorMap = std::map>>; @@ -193,7 +193,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -219,7 +219,7 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -294,7 +294,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int 
dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -353,7 +353,7 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -730,7 +730,7 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void InitNaiveBestFitCUDAPinnedAllocator() { if (FLAGS_use_auto_growth_pinned_allocator) { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; @@ -804,7 +804,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, @@ -890,7 +890,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); allocators_[p] = std::make_shared( cuda_allocator, @@ -1252,7 +1252,7 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1276,7 +1276,7 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1322,7 +1322,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1355,7 +1355,7 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1489,7 +1489,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) AllocatorFacadePrivate* m = GetPrivate(); if 
(!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1515,7 +1515,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1527,7 +1527,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index acfd73a411932..39819e0d66bdc 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -81,7 +81,7 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 4f08db4921f8b..0f532d1fff4d7 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define USE_DEVICE PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -54,7 +54,7 @@ BuddyAllocator::BuddyAllocator( }; use_custom_device_ = true; } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; #endif @@ -279,7 +279,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize( init_allocate_size_func_, re_allocate_size_func_, request_bytes); #else -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); #endif diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 781addd7dba60..3f50fa9651ced 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,6 +23,10 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #include #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 7286f84160c6a..139e2358d161c 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -82,6 +82,9 @@ class GPUContextAllocator : public Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); @@ -92,8 +95,9 @@ class GPUContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -113,6 +117,9 @@ class GPUContextAllocator : public Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 77ca495cacbc7..331fe723d32bb 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif 
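The MUSA branches added in these allocator and context files reuse the usual record-then-wait event handshake. A standalone sketch of that handshake as it appears in this patch; the musaEvent_t type name and the event flag are assumed to mirror the CUDA runtime, as the musa* calls in the diff do, and compute_stream/comm_stream stand for the producing and consuming streams.

  musaEvent_t event;
  PADDLE_ENFORCE_GPU_SUCCESS(
      musaEventCreateWithFlags(&event, musaEventDisableTiming));
  // Record on the producing stream, then make the consuming stream wait, so
  // work enqueued after the wait observes the producer's writes.
  PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream));
  PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0));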
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index d39cb285517f2..c8ac552bf1b73 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" @@ -213,7 +213,7 @@ size_t Used(const platform::XPUPlace &place) { } // For CUDA -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUBuddyAllocatorList { private: GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { @@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { template <> size_t Used(const platform::CUDAPlace &place) { -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -294,7 +294,7 @@ size_t Used(const platform::CUDAPlace &place) { template <> void *Alloc(const platform::CUDAPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -315,6 +315,8 @@ void *Alloc(const platform::CUDAPlace &place, if (FLAGS_init_allocated_mem) { #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0xEF, size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0xEF, size); #else cudaMemset(ptr, 0xEF, size); #endif @@ -331,7 +333,7 @@ template <> void Free(const platform::CUDAPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GetGPUBuddyAllocator(place.device)->Free(p); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -341,7 +343,7 @@ void Free(const platform::CUDAPlace &place, template <> uint64_t Release(const platform::CUDAPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Release(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -349,7 +351,7 @@ uint64_t Release(const platform::CUDAPlace &place) { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; static BuddyAllocator *ba = nullptr; @@ -367,7 +369,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { template <> size_t Used(const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetCUDAPinnedBuddyAllocator()->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -378,7 +380,7 @@ 
size_t Used(const platform::CUDAPinnedPlace &place) { template <> void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); void *ptr = buddy_allocator->Alloc(size); @@ -400,7 +402,7 @@ template <> void Free(const platform::CUDAPinnedPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -412,7 +414,7 @@ void Free(const platform::CUDAPinnedPlace &place, template <> uint64_t Release( const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Release on " << platform::Place(place); return GetCUDAPinnedBuddyAllocator()->Release(); #else @@ -603,7 +605,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { } size_t Usage::operator()(const platform::CUDAPlace &gpu) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(gpu); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -612,7 +614,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(cuda_pinned); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 32853f08f94e5..206ad95446801 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -23,6 +23,8 @@ bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -38,6 +40,8 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 48b18f07456c6..30fe2d9b095eb 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -92,6 +92,17 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { } PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); + +#elif 
defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event); + if (err == musaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completed event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = hipEventQuery(event); if (err == hipErrorNotReady) { @@ -128,6 +139,9 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -142,6 +156,8 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 31508a1079922..79a7c7abf01de 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -24,6 +24,9 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index e9a9fcbff9831..cb9c4afd7b9fc 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +120,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -216,6 +216,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { // PINNED memory is visible to all CUDA contexts. #ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -259,6 +261,22 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } +#elif defined(PADDLE_WITH_MUSA) + err = musaFreeHost(p); + + // Purposefully allow musaErrorMusartUnloading, because + // that is returned if you ever call musaFreeHost after the + // driver has already shutdown.
This happens only if the + // process is terminating, in which case we don't care if + // musaFreeHost succeeds. + if (err != musaErrorMusartUnloading) { + PADDLE_ENFORCE_EQ( + err, + 0, + platform::errors::Fatal( + "musaFreeHost failed in GPUPinnedAllocator, error code is %d", + err)); + } #else err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 67376a3e39a22..b2cce04a04d37 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c40da19d47e5..63504621f98c5 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,7 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3b098e5a13e51..48fbc541e5fa9 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -49,7 +49,7 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index bffbcbdfad76b..c8ce60e7c39d6 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -18,7 +18,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" -#include "paddle/utils/test_macros.h" #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -111,11 +110,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -TEST_API void Copy(platform::CPUPlace, - void* dst, - platform::CPUPlace, - const void* src, - size_t num) { +void Copy(platform::CPUPlace, + void* dst, + platform::CPUPlace, + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -257,7 +256,8 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K #ifdef PADDLE_WITH_HIP @@ -272,10 +272,22 @@ inline void SyncCUDAStream() { } #endif } +#elif defined(PADDLE_WITH_MUSA) +inline void SyncCUDAStream() { +#if !defined(_WIN32) + musaStreamSynchronize(0); +#else + musaError_t e_sync = musaSuccess; + while (e_sync = musaStreamQuery(0)) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif +} #else inline void SyncCUDAStream() { #if !defined(_WIN32) - cudaStreamSynchronize(nullptr); + cudaStreamSynchronize(0); #else cudaError_t e_sync = cudaSuccess; while (e_sync = cudaStreamQuery(0)) { @@ -293,7 +305,7 @@ inline void SyncCUDAStream() { // https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ template <> -TEST_API void Copy( +void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, @@ -314,6 +326,12 @@ TEST_API void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -326,6 +344,8 @@ TEST_API void Copy( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -337,7 +357,7 @@ TEST_API void Copy( } template <> -TEST_API void Copy( +void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -358,6 +378,12 @@ TEST_API void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -370,6 +396,8 @@ TEST_API void Copy( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -404,6 +432,12 @@ void Copy( num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -417,6 +451,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP 
platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -456,7 +492,7 @@ void Copy( } template <> -TEST_API void Copy( +void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -492,7 +528,7 @@ void Copy( if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -516,6 +558,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -534,7 +578,7 @@ void Copy( platform::SetDeviceId(dst_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -558,6 +608,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -744,10 +796,11 @@ void Copy(phi::Place dst_place, VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { // NOLINT + dst_place.GetType() == phi::AllocationType::CPU) { std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index fe5fae7bafaeb..6754c17978ea3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,7 +102,7 @@ op_library(quantize_linear_op DEPS phi common) op_library(save_combine_op DEPS string_array phi common) op_library(load_combine_op DEPS string_array) -if (WITH_GPU OR WITH_ROCM) +if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS}) endif() @@ -110,7 +110,7 @@ if (WITH_MKLDNN)
register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS ${OP_HEADER_DEPS}) endif() -if (WITH_GPU OR WITH_ROCM) +if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS}) elseif (WITH_XPU_KP) op_library(activation_op SRCS activation_op.cc activation_op.kps DEPS ${OP_HEADER_DEPS}) @@ -118,9 +118,9 @@ else() op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS}) endif() -if (WITH_GPU OR WITH_ROCM) +if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) op_library(sync_batch_norm_op DEPS processgroup_comm_utils) - if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) ) + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT WITH_MUSA) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) ) op_library(sparse_attention_op DEPS processgroup_comm_utils) endif() endif() @@ -152,10 +152,10 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils) -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl) endif() -if (WITH_GPU OR WITH_ROCM) +if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) endif() if(WITH_XPU) diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index a07f311c6125e..dcbe58ffceb6a 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 2c85ec6ea2076..79e677034ce0f 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -55,7 +55,7 @@ struct ArrayToLoDFunctor { if (std::is_same::value) { Apply(static_cast(pool.Get(place))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 012edde57294a..c25344994cb50 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index ecfae25270f91..aa03c2b57355c 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -19,6 +19,14 @@ #include typedef hiprandState curandState; namespace cub = hipcub; + +#elif defined(PADDLE_WITH_MUSA) +#include +#include + +#include +typedef murandState curandState; + #else #include #include @@ -34,7 +42,7 @@ namespace cub = hipcub; #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -76,6 +84,11 @@ __global__ void RandomSampleClassCenter(const int64_t n, CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } +#elif defined(PADDLE_WITH_MUSA) + murand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(murand(&localState) % max_val); + } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { @@ -352,7 +365,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); T* num_classes_per_device_ptr = num_classes_per_device.data(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nranks > 1) { auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -397,15 +410,15 @@ void ClassCenterSampleKernel(const Context& dev_ctx, if (comm_ctx) { comm_ctx->AllReduce( - &num_classes_per_device, num_classes_per_device, ncclSum, stream); + &num_classes_per_device, num_classes_per_device, mcclSum, stream); paddle::platform::GpuStreamSync(stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - ncclSum, + mcclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 1c8c8f00217cc..fdecbca81fc59 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -30,7 +30,7 @@ register_operators( DEPS ${COLLECTIVE_DEPS}) -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi common) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 11b51602d4d75..b554d658126f5 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,12 +33,12 @@ template class AllToAllOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int send_numel = x->numel(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -114,7 +114,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { comm_ctx->GroupEnd(); VLOG(3) << "new comm_context_manager has rid " << ring_id; } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (auto i = 0; i < nranks; ++i) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); @@ -122,7 +122,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); VLOG(3) << "old NCCLCommContext has rid " << ring_id; } #else diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index 210c42d30f6d5..2b1f04a491d5e 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/barrier_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,12 +30,12 @@ template class BarrierOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -62,7 +62,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); auto stream = comm_ctx->GetStream(); - ncclRedOp_t nccl_red_type = ncclSum; + mcclRedOp_t nccl_red_type = mcclSum; comm_ctx->AllReduce(out, *in, nccl_red_type, stream); platform::GpuStreamSync(stream); VLOG(3) << "new NCCLCommContext has rid " << rid; @@ -70,8 +70,8 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); // should ExecutionContext for calc stream. auto stream = ctx.cuda_device_context().stream(); - ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, + mcclRedOp_t nccl_red_type = mcclSum; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index bd105c35886cb..0de5e22aaabeb 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,10 +33,10 @@ template class CAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -103,10 +103,10 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(out, *in, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(send_buff, + platform::dynload::mcclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 277988b56916f..b45f568b835f8 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ALL_LAYOUT, ops::CAllReduceMaxCUDAKernel, float, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 9cd472f421788..7bf5e59431f8f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -25,14 +25,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -309,13 +309,13 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -395,22 +395,22 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { << ", dtype:" << dtype << ", comm:" << comm << ", stream:" << stream; - ncclRedOp_t nccl_red_type = ncclSum; + mcclRedOp_t nccl_red_type = mcclSum; switch (red_type) { case kRedSum: - nccl_red_type = ncclSum; + nccl_red_type = mcclSum; break; case kRedMax: - nccl_red_type = ncclMax; + nccl_red_type = mcclMax; break; case kRedMin: - nccl_red_type = ncclMin; + nccl_red_type = mcclMin; break; case kRedProd: - nccl_red_type = ncclProd; + nccl_red_type = mcclProd; break; default: @@ -421,7 +421,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->AllReduce(out, *in, nccl_red_type, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 76d809cd234f0..f886e4aaab212 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, ALL_LAYOUT, ops::CAllReduceSumCUDAKernel, float, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 4d49bc4990c6e..348c22bd8be48 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -29,7 +29,7 @@ template class CBroadcastOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -50,11 +50,11 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { } else { // NOTE(liyurui): This will be removed after moving this operator to phi. int numel = x->numel(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (root == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -71,7 +71,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( out->data(), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << common::product(out->dims()); @@ -100,8 +100,8 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, int64_t, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 2dc9af0139546..2e84a0e80c2dc 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/platform/collective_helper.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -56,7 +56,7 @@ class CCommInitAllOp : public framework::OperatorBase { // platform::errors::PreconditionNotMet( // "CCommInitAllOp can run on gpu place only")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) std::vector devices = Attr>("devices"); if (devices.empty()) { devices = platform::GetSelectedDevices(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 39d22fcd5f50d..4d92c369abfeb 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -17,6 +17,10 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_RCCL) #include #endif + +#if defined(PADDLE_WITH_MCCL) +#include +#endif #include #include @@ -28,7 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" // #include "paddle/fluid/operators/distributed/distributed.h" // #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -55,8 +59,8 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input X must be provided.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclUniqueId* nccl_id = var->GetMutable(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + mcclUniqueId* nccl_id = var->GetMutable(); int ntrainers = Attr("ntrainers"); int train_id = Attr("trainer_id"); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 086257eab6038..3f7683fb405cb 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -17,6 +17,11 @@ limitations under the License. */ #if defined(PADDLE_WITH_RCCL) #include #endif + +#if defined(PADDLE_WITH_MCCL) +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -24,12 +29,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/collective_helper.h" #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -92,8 +97,8 @@ class CCommInitOp : public framework::OperatorBase { #endif } else { // TODO(wangxi): Put this in the unified header file -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - using UniqueId = ncclUniqueId; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + using UniqueId = mcclUniqueId; using CommContext = platform::NCCLCommContext; #elif defined(PADDLE_WITH_XPU_BKCL) using UniqueId = BKCLUniqueId; @@ -109,7 +114,7 @@ class CCommInitOp : public framework::OperatorBase { platform::errors::PreconditionNotMet( "CCommInitOp can run on gpu or xpu place only.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( @@ -145,7 +150,7 @@ class CCommInitOp : public framework::OperatorBase { return; } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) VLOG(3) << "#### use old comm lab ####"; UniqueId* comm_id = 
var->GetMutable(); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index d13179cbae48b..f170e07b6532f 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/api/include/tensor.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -38,7 +38,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -65,7 +65,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { rank, nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) phi::DenseTensor temp_out; framework::DDim temp_out_dims = x->dims(); temp_out_dims[0] *= nranks; @@ -130,10 +130,10 @@ class CConcatOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(&temp_out, *x, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(send_buff, + platform::dynload::mcclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -175,8 +175,8 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 4a07f7e98f793..9851b9d9d9f68 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,14 +27,14 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -44,8 +44,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); } } @@ -68,7 +68,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { std::string endpoint = 
Attr("endpoint"); - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (!FLAGS_dynamic_static_unified_comm) { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 20884d1ae8a96..26cacdd87fa86 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -26,14 +26,14 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -236,12 +236,12 @@ template class CReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -286,22 +286,22 @@ class CReduceOpCUDAKernel : public framework::OpKernel { stream = ctx.cuda_device_context().stream(); } - ncclRedOp_t nccl_red_type = ncclSum; + mcclRedOp_t nccl_red_type = mcclSum; switch (red_type) { case kRedSum: - nccl_red_type = ncclSum; + nccl_red_type = mcclSum; break; case kRedMax: - nccl_red_type = ncclMax; + nccl_red_type = mcclMax; break; case kRedMin: - nccl_red_type = ncclMin; + nccl_red_type = mcclMin; break; case kRedProd: - nccl_red_type = ncclProd; + nccl_red_type = mcclProd; break; default: @@ -315,7 +315,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Reduce(out, *in, nccl_red_type, root, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index cd1cf0c017636..af26bf7d858ba 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,7 +30,7 @@ template class CReduceScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -105,14 +105,14 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); if (comm_ctx) { - comm_ctx->ReduceScatter(out, *in, ncclSum, stream); + comm_ctx->ReduceScatter(out, *in, mcclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduceScatter( send_buff, recv_buff, recv_numel, - static_cast(dtype), - ncclSum, + static_cast(dtype), + mcclSum, comm->comm(), stream)); } @@ -135,9 +135,9 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 7f4b4f6734de0..86bb602256aef 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_scatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,11 +30,11 @@ template class CScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -123,7 +123,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { } } else { if (root_id == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -137,7 +137,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index f8f43d5c9da48..7ea80d8a54e9a 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/softmax_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -208,17 +208,17 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_logits.maximum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); } else { void* logits_max_buff = logits_max.mutable_data(place); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(logits_max.dtype())), - ncclMax, + mcclMax, comm->comm(), stream)); } @@ -273,16 +273,16 @@ struct CSoftmaxWithCrossEntropyFunctor { predicted_logits.mutable_data(place); if (comm_ctx) { - comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream); + comm_ctx->AllReduce(&predicted_logits, predicted_logits, mcclSum, stream); } else { void* predict_logits_buff = predicted_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(predicted_logits.dtype())), - ncclSum, + mcclSum, comm->comm(), stream)); } @@ -301,16 +301,16 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_softmax.sum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); } else { void* sum_exp_logits_buff = sum_exp_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(sum_exp_logits.dtype())), - ncclSum, + mcclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..79c32bc907045 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h index 8d60d633272a9..52f4e6f6d88fe 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h @@ -18,14 +18,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -40,7 +40,7 @@ template class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index f2eab0532b9df..c97da1a737b0f 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { "wait_comm op can run on gpu place only for now, but got %s", place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,6 +89,9 @@ class CWaitCommOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 33b56cbe6581d..3088e1ed61d66 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { "wait_compute op can run on gpu place only for now, but got %s", 
place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,6 +89,9 @@ class CWaitComputeOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 1d03cb151e4a0..da13a5ba800a6 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,14 +34,14 @@ class Scope; namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -51,8 +51,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); } } @@ -130,7 +130,7 @@ class GenNCCLIdOp : public framework::OperatorBase { << ", trainers:" << ss.str(); int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(nccl_comm_num); /// 1. init flat diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 7a9c02628088f..a1e09d2c35cbb 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/global_gather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -31,8 +31,8 @@ namespace operators { template struct GlobalGatherFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +// #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -73,7 +73,7 @@ struct GlobalGatherFunctor { cpu_global_count_data = cpu_global_count.data(); } - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -165,11 +165,11 @@ struct GlobalGatherFunctor { auto send_buf = x->data(); auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -179,7 +179,7 @@ struct GlobalGatherFunctor { send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -188,13 +188,13 @@ struct GlobalGatherFunctor { stream)); } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); } } -#else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -#endif +// #else + // PADDLE_THROW( + // platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +// #endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -205,8 +205,8 @@ struct GlobalGatherFunctor { template struct GlobalGatherProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +// #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -304,14 +304,16 @@ struct GlobalGatherProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -#else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -#endif +// #else +// PADDLE_THROW( +// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +// #endif #else PADDLE_THROW( 
platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 6b915d35be043..38a992d3baaa3 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ namespace operators { template struct GlobalScatterFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +// #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -72,7 +72,7 @@ struct GlobalScatterFunctor { global_count_len = cpu_global_count.numel(); } - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -173,11 +173,11 @@ struct GlobalScatterFunctor { auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -186,7 +186,7 @@ struct GlobalScatterFunctor { stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -196,14 +196,14 @@ struct GlobalScatterFunctor { recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); } } -#else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -#endif +// #else +// PADDLE_THROW( +// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +// #endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -214,8 +214,8 @@ struct GlobalScatterFunctor { template struct GlobalScatterProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +// #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -311,14 +311,16 @@ struct GlobalScatterProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#elif 
defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -#else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -#endif +// #else +// PADDLE_THROW( +// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +// #endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc index b4773a8eb5456..d53a92369df40 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc @@ -31,8 +31,8 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index b0cdabce48503..863850b6e3839 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/partial_allgather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,11 +32,11 @@ template class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int64_t numel = in->numel(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -128,10 +128,10 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { const T* send_buff = in->data() + offset; T* recv_buff = out->data(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(send_buff, + platform::dynload::mcclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -155,9 +155,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather, ops::PartialAllGatherOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index c8844058696e1..fdfb31e7b2eab 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/partial_recv_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) + // NCCL_VERSION_CODE >= 2703 auto out = ctx.Output("Out"); auto out_dims = out->dims(); auto numel = out->numel(); @@ -142,7 +142,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { peer, nranks)); - ncclDataType_t dtype = platform::ToNCCLDataType(type); + mcclDataType_t dtype = platform::ToNCCLDataType(type); if (comm_ctx) { auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel); @@ -150,7 +150,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { comm_ctx->Recv(&recv_buf, recv_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclRecv(out->data() + offset, + platform::dynload::mcclRecv(out->data() + offset, recv_numel, dtype, peer, @@ -180,9 +180,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 39858b3ed37a2..d395f3a5febb3 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/partial_send_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialSendCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) + // NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); int numel = x->numel(); int rid = ctx.Attr("ring_id"); @@ -136,7 +136,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { peer, nranks)); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); if (comm_ctx) { @@ -145,7 +145,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { comm_ctx->Send(send_buf, send_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclSend(x->data() + offset, + platform::dynload::mcclSend(x->data() + offset, send_numel, dtype, peer, @@ -176,9 +176,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_send, ops::PartialSendCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 41c2e70df8c35..283e75d7a53e8 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/recv_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -29,8 +29,7 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) framework::DDim recv_shape_info(const platform::Place &place, const gpuStream_t &stream, platform::NCCLComm *comm, @@ -47,7 +46,7 @@ framework::DDim recv_shape_info(const platform::Place &place, } phi::DataType shape_dtype = phi::DataType::INT32; - ncclDataType_t nccl_dtype = + mcclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); // step1: recv the shape size @@ -60,7 +59,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_size_tensor, 1, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( gpu_data, 1, nccl_dtype, peer, comm->comm(), stream)); } } @@ -90,7 +89,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_tensor, shape_size, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( gpu_shape_data, shape_size, nccl_dtype, peer, comm->comm(), stream)); } } @@ -124,8 +123,7 @@ template class RecvOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -216,7 +214,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { int data_type = ctx.Attr("dtype"); framework::proto::VarType::Type type = framework::proto::VarType::Type(data_type); - ncclDataType_t dtype = platform::ToNCCLDataType(type); + mcclDataType_t dtype = platform::ToNCCLDataType(type); auto *out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { @@ -235,7 +233,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Recv(out, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out_dims) << " from " << peer; @@ -274,7 +272,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { "be less than comm->nranks (%d).", peer, comm->nranks())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out->dims()) << " from " << peer; @@ -299,9 +297,9 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && 
CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 86be6908e3cd2..5ad3124b32017 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/send_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -28,8 +28,7 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) void send_shape_info(const phi::DenseTensor& x, const platform::Place& place, const gpuStream_t& stream, @@ -46,7 +45,7 @@ void send_shape_info(const phi::DenseTensor& x, "to send the shape info.")); } phi::DataType shape_dtype = phi::DataType::INT32; - ncclDataType_t nccl_dtype = + mcclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); auto dims = x.dims(); int shape_size = dims.size(); @@ -73,7 +72,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_size_tensor, 1, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclSend(gpu_shape_size_tensor->data(), + platform::dynload::mcclSend(gpu_shape_size_tensor->data(), 1, nccl_dtype, peer, @@ -106,7 +105,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_tensor, shape_size, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclSend(gpu_shape_tensor->data(), + platform::dynload::mcclSend(gpu_shape_tensor->data(), shape_size, nccl_dtype, peer, @@ -122,8 +121,7 @@ template class SendOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ - NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -217,12 +215,12 @@ class SendOpV2CUDAKernel : public framework::OpKernel { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; auto& x = x_array.at(idx); int numel = x.numel(); - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x.dtype())); if (comm_ctx) { comm_ctx->Send(x, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); } VLOG(3) << "rank " << comm->rank() << " send " @@ -247,9 +245,9 @@ class SendOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Send(*x, numel, peer, stream); } else { - ncclDataType_t dtype = + mcclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << common::product(x->dims()) << " to " << peer; @@ -274,9 +272,9 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, ops::SendOpV2CUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -#endif +// #endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 0f04a295ed263..d5419d2b13a4e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,7 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 94b946e43dc7a..b44be01ca1a8e 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -222,7 +222,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9262ca59af970..3fb50e695d1a3 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,7 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +34,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 8ddce0da7faac..ef0dccff7197f 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -227,7 +227,7 @@ bool GetCondData(const phi::DenseTensor &cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git 
a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 509c067e24e42..da1eec366937d 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -216,7 +216,7 @@ class DataNormGradKernel : public framework::OpKernel { d_batch_square_sum); if (need_sync_stats) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) int rid = 0; platform::NCCLComm *comm = nullptr; const auto &comm_context_manager = @@ -247,59 +247,59 @@ class DataNormGradKernel : public framework::OpKernel { } if (comm_ctx) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - ncclSum, + mcclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index d38a72556f759..688178ac7b582 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -11,7 +11,7 @@ function(detection_library TARGET_NAME) set(srcs) # filter cuda source file when not build with cuda/rocm foreach(src ${detection_library_SRCS}) - if(NOT WITH_GPU 
AND NOT WITH_ROCM) + if(NOT WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA) if(${src} MATCHES ".*\\.cc$") list(APPEND srcs ${src}) endif() @@ -57,7 +57,7 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(TMPDEPS memory) if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index adb60a8a8d064..945678dfd96ac 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b2bbd9c82095c..6f203e9cca737 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index d954ea1bf82af..807f7e907e5ce 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/clip_by_norm_op.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/clip_by_norm_kernel.h" #include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" @@ -25,48 +26,49 @@ template class DGCClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto rampup_begin_step = ctx.Attr("rampup_begin_step"); - if (static_cast(rampup_begin_step) < 0) { - return; - } + PADDLE_ENFORCE(false, "not supported"); + // auto rampup_begin_step = ctx.Attr("rampup_begin_step"); + // if (static_cast(rampup_begin_step) < 0) { + // return; + // } - auto current_step_tensor = ctx.Input("current_step"); - auto* current_step = current_step_tensor->data(); + // auto current_step_tensor = ctx.Input("current_step"); + // auto* current_step = current_step_tensor->data(); - VLOG(10) << "current_step:" << *current_step - << ", rampup_begin_step:" << rampup_begin_step; + // VLOG(10) << "current_step:" << *current_step + // << ", rampup_begin_step:" << rampup_begin_step; - if (static_cast(*current_step) < static_cast(rampup_begin_step)) { - VLOG(10) << "current_step:" << *current_step - << " < rampup_begin_step:" << rampup_begin_step - << " so does't use dgc_clip_by_norm"; - return; - } + // if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + // VLOG(10) << "current_step:" << *current_step + // << " < rampup_begin_step:" << rampup_begin_step + // << " so does't use dgc_clip_by_norm"; + // return; + // } - auto in_var = ctx.InputVar("X"); - auto max_norm = ctx.Attr("max_norm"); - auto& dev_ctx = ctx.device_context(); + // auto in_var = ctx.InputVar("X"); + // auto max_norm = ctx.Attr("max_norm"); + // auto& dev_ctx = ctx.device_context(); - if (in_var->IsType()) { - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - return phi::ClipByNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - max_norm, - y); - } else if (in_var->IsType()) { - auto* x = ctx.Input("X"); - phi::SelectedRows* output_selected_rows = - ctx.Output("Out"); - return phi::sr::ClipByNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - max_norm, - output_selected_rows); - } + // if (in_var->IsType()) { + // auto* x = ctx.Input("X"); + // auto* y = ctx.Output("Out"); + // return phi::ClipByNormKernel( + // static_cast::TYPE&>(dev_ctx), + // *x, + // max_norm, + // y); + // } else if (in_var->IsType()) { + // auto* x = ctx.Input("X"); + // phi::SelectedRows* output_selected_rows = + // ctx.Output("Out"); + // return phi::sr::ClipByNormKernel( + // static_cast::TYPE&>(dev_ctx), + // *x, + // max_norm, + // output_selected_rows); + // } }; }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index face0f758f848..1b2dc157fb402 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,11 +32,14 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #ifdef __NVCC__ #include #elif defined(__HIPCC__) #include +#elif defined(__MUSACC__) +#include +#include #endif #include @@ -311,7 +314,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 4c2dd99265781..8be70c6fc8e93 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index bdf8a80debb64..976ce30d2f0be 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -193,6 +193,8 @@ struct FindChannelAbsMaxFunctor { #ifdef PADDLE_WITH_HIP hipMemset(out_abs_max, 0, sizeof(T) * cout); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index ced20a0108a52..942dd94f4dca2 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,16 +32,16 @@ if(WITH_XPU) op_library(fused_feedforward_op) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) # fused_bn_activation_op needs cudnn 7.4.1 above # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_activation_op) endif() # HIP not support cudnnTransformTensor # fusion_conv_inception_op needs cudnn 7 above # HIP not support cudnnConvolutionBiasActivationForward - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) op_library(fusion_conv_inception_op) endif() op_library(yolo_box_head_op) @@ -53,12 +53,12 @@ if(WITH_GPU OR WITH_ROCM) endif() # fused_bn_add_activation # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_add_activation_op) endif() # fused_dropout # only support CUDA - if(NOT WITH_ROCM) + if(NOT WITH_ROCM AND NOT WITH_MUSA) op_library(fused_feedforward_op) # fused_attention_op op_library(fused_attention_op) @@ -66,7 +66,7 @@ if(WITH_GPU OR WITH_ROCM) op_library(fused_multi_transformer_int8_op) endif() # resnet_unit 
needs cudnn 8.0 above - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) op_library(resnet_unit_op) endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 8ea1e11cd29f4..6b3e435529e71 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index b198c4a579291..c37b6e2307b58 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -34,7 +34,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, const phi::GPUContext &dev_ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -86,10 +86,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + sendbuff, recvbuff, numel, dtype, mcclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index ccd099109487c..7081180ea6766 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include -#include +#include +#include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 40717402846db..ad73be604fddb 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include @@ -39,7 +39,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/fusion/gpu/attn_gemm.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -61,7 +61,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int count, const phi::GPUContext &ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -117,10 +117,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + sendbuff, recvbuff, count, dtype, mcclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..e78579a27c1a9 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -150,6 +150,34 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_input_values, + input_data.data(), + input_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_output_values = + reinterpret_cast(&gpu_input_values[input_data.size()]); + platform::GpuMemcpyAsync(gpu_output_values, + output_data.data(), + output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_seqpool_output_values = + reinterpret_cast(&gpu_output_values[output_data.size()]); + platform::GpuMemcpyAsync(gpu_seqpool_output_values, + seqpool_output_data.data(), + seqpool_output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + size_t **lods_values = reinterpret_cast( + &gpu_seqpool_output_values[seqpool_output_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -356,6 +384,37 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_out_grads_values, + out_grads_data.data(), + out_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_in_grads_values = + reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_in_grads_values, + in_grads_data.data(), + in_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_cvm_values = + 
reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_cvm_values, + cvm_data.data(), + cvm_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + size_t **lods_values = + reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 72bb97a2aae9e..c6fe13548033a 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -255,6 +255,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -268,6 +271,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -283,6 +289,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_tensor); + musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -296,6 +305,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -356,6 +368,13 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + musaMemcpy(device_anchors, + anchors.data(), + anchors.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -388,6 +407,10 @@ class YoloBoxPostKernel : public framework::OpKernel { hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -398,6 +421,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), 
sizeof(int)); @@ -409,6 +435,8 @@ class YoloBoxPostKernel : public framework::OpKernel { int* bbox_index_device_ptr; #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -456,6 +484,12 @@ class YoloBoxPostKernel : public framework::OpKernel { ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, + ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -534,6 +568,8 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(bbox_index_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif @@ -541,6 +577,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(ts_info[i].bboxes_dev_ptr); + musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..c6a8a4fe7b982 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index b4e0f511f6d61..b45fdd9619a61 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -32,6 +32,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -95,6 +98,12 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murandState rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, + 0, + &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -128,6 +137,8 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index c88d36602bd79..3530beda000b4 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..ea38db87e63e7 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8c123bb8a32f2..e1e9ca5ef6667 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 5352ccc99df92..5c03b7395a4f2 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 8f0b705c8de79..3918ba5459980 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..197aaa74bb3e1 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..da8ea875e9393 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index edd8b20da160c..3f0ccf3bf40ff 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -221,6 +221,9 @@ struct LookupTableV2GradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) 
+ PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 75ef56accb10b..216e9863a5e27 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -16,6 +16,8 @@ #ifdef PADDLE_WITH_HIP #include namespace cub = hipcub; +#elif defined(PADDLE_WITH_MUSA) + #else #include #endif @@ -36,7 +38,7 @@ namespace cub = hipcub; #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -72,7 +74,7 @@ void GetClassInterval(const gpuStream_t& stream, return; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) DenseTensor num_classes_per_device; phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); int* num_classes_per_device_ptr = num_classes_per_device.data(); @@ -123,15 +125,15 @@ void GetClassInterval(const gpuStream_t& stream, if (comm_ctx) { comm_ctx->AllReduce(&num_classes_per_device, num_classes_per_device, - ncclSum, + mcclSum, calcu_stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - ncclSum, + mcclSum, comm->comm(), calcu_stream)); } @@ -270,7 +272,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, DenseTensor* loss) { const auto& place = dev_ctx.GetPlace(); // old code -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) paddle::platform::NCCLComm* comm = nullptr; const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); @@ -405,7 +407,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::IdentityFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -419,14 +421,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(logits_max_buff, + phi::dynload::mcclAllReduce(logits_max_buff, logits_max_buff, logits_max.numel(), phi::ToNCCLDataType(logits_max.dtype()), - ncclMax, + mcclMax, comm->comm(), stream)); } @@ -450,7 +452,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::ExpFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -464,14 +466,14 @@ void 
MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), phi::ToNCCLDataType(sum_exp_logits.dtype()), - ncclSum, + mcclSum, comm->comm(), stream)); } @@ -512,7 +514,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, class_interval.data()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -526,14 +528,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(loss, *loss, ncclSum, stream); + comm_ctx->AllReduce(loss, *loss, mcclSum, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(loss_ptr, + phi::dynload::mcclAllReduce(loss_ptr, loss_ptr, loss->numel(), phi::ToNCCLDataType(loss->dtype()), - ncclSum, + mcclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 76e27380b90e2..d1e0a772f3eaa 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -20,6 +20,12 @@ limitations under the License. */ #include // NOLINT #endif + +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include @@ -47,7 +53,7 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // This functor involves a fusion calculation in Ernie or Bert. 
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 857d870847ee8..2b0d3432720df 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -28,7 +28,7 @@ struct GRUUnitFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, @@ -92,7 +92,7 @@ struct GRUUnitGradFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, @@ -182,7 +182,7 @@ struct GRUUnitFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(CblasNoTrans, @@ -234,7 +234,7 @@ struct GRUUnitGradFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) // calculate grad_update_gate, grad_frame_state, // grad_reset_output, grad_reset_gate detail::cpu_gru_backward(context, diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 3032b78a2029d..792a08423be0a 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -14,7 +14,7 @@ #pragma once -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 00ff1fbcbc38d..1762353abaa9f 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index bf028c4ada369..87fe1ee33f0f1 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -160,6 +160,11 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, s_data, sizeof(int64_t) * num_samples, hipMemcpyHostToDevice)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true, + s_data, + sizeof(int64_t) * num_samples, + musaMemcpyHostToDevice)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 524ba826a5704..da8c22aa67bbb 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,7 @@ class SampleWithProb { } }; -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 895a427bae6e2..f082189fa0f37 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -98,7 +98,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -112,7 +112,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -271,7 +271,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) if (context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -403,7 +403,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -645,7 +645,7 @@ class MatMulOp : public framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -788,7 +788,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..a4b6e061bfdff 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,7 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 3ed27460e16b6..935b93d1c3ae3 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 64bc176d97149..580ea2da8721c 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index 629b41b4b582b..2d079c8ef521d 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT (WITH_NCCL OR WITH_RCCL)) +if(NOT (WITH_NCCL OR WITH_RCCL OR WITH_MCCL)) return() endif() @@ -16,7 +16,14 @@ if(WITH_ROCM AND NOT WIN32) DEPS device_context operator) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_MUSA AND NOT WIN32) + musa_library( + nccl_common + SRCS nccl_gpu_common.cc + DEPS device_context operator) +endif() + +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) op_library(nccl_op DEPS nccl_common) set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 9f7d967a84708..4916d71b2f73a 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -18,7 +18,7 @@ namespace paddle { namespace platform { namespace { // TODO(panyx0718): Where to destroy them. -std::unique_ptr> global_comms; +std::unique_ptr> global_comms; std::unique_ptr> comm_id_map; bool inited = false; size_t last_num_gpus = -1; @@ -41,21 +41,21 @@ void Communicator::InitAll(const std::vector& gpus) { if (global_comms) { for (size_t i = 0; i < global_comms->size(); ++i) { // FIXME(dzh) : PADDLE_ENFORCE return void - dynload::ncclCommDestroy((*global_comms)[i]); + dynload::mcclCommDestroy((*global_comms)[i]); } } - global_comms = std::make_unique>(); + global_comms = std::make_unique>(); comm_id_map = std::make_unique>(); global_comms->resize(gpus.size()); for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } PADDLE_ENFORCE_GPU_SUCCESS( - dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); + dynload::mcclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); inited = true; } -const std::vector& Communicator::comms() const { +const std::vector& Communicator::comms() const { std::lock_guard guard(comm_mu); return *global_comms; } diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h index 01905d8ca84b3..0427180d56c04 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.h +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -25,6 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" #else #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -42,7 +44,7 @@ struct Communicator { void InitAll(const std::vector& gpus); - const std::vector& comms() const; + const std::vector& comms() const; }; } // namespace platform diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 8b06aa653c070..7e9b2b1d4dd19 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -105,8 +105,8 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), + (reduction == "mcclSum" || reduction == "mcclProd" || + reduction == "mcclMin" || reduction == "mcclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -124,9 +124,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); AddAttr("reduction", - "(string, default 'ncclSum') " - "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") - .SetDefault("ncclSum"); + "(string, default 'mcclSum') " + "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") + .SetDefault("mcclSum"); AddComment(R"DOC( NCCLAllReduce Operator. @@ -151,8 +151,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), + (reduction == "mcclSum" || reduction == "mcclProd" || + reduction == "mcclMin" || reduction == "mcclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -170,9 +170,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Reduce op"); AddAttr("reduction", - "(string, default 'ncclSum') " - "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") - .SetDefault("ncclSum"); + "(string, default 'mcclSum') " + "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") + .SetDefault("mcclSum"); AddAttr("root", "(int, default kInvalidGPUId) " "Root gpu of the parameter. 
If not, " @@ -246,10 +246,10 @@ REGISTER_OPERATOR( ops::NCCLInitOpVarTypeInference, ops::NCCLInitOpShapeInference); -REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, +REGISTER_OP_WITHOUT_GRADIENT(mcclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, +REGISTER_OP_WITHOUT_GRADIENT(mcclBcast, ops::NCCLBcastOp, ops::NCCLBcastOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index abb24cc8cae10..7b99c47cf13c8 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -27,33 +27,33 @@ class NCCLTypeWrapper; template <> class NCCLTypeWrapper { public: - static const ncclDataType_t type = ncclFloat; + static const mcclDataType_t type = mcclFloat; }; template <> class NCCLTypeWrapper { public: - static const ncclDataType_t type = ncclDouble; + static const mcclDataType_t type = mcclDouble; }; -static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { - static const std::unordered_map str_to_type = { - {"ncclSum", ncclSum}, - {"ncclMin", ncclMin}, - {"ncclMax", ncclMax}, - {"ncclProd", ncclProd}, +static mcclRedOp_t str_to_nccl_red_type(std::string reduction) { + static const std::unordered_map str_to_type = { + {"mcclSum", mcclSum}, + {"mcclMin", mcclMin}, + {"mcclMax", mcclMax}, + {"mcclProd", mcclProd}, }; auto it = str_to_type.find(reduction); PADDLE_ENFORCE_EQ(it != str_to_type.end(), true, platform::errors::InvalidArgument( - "Invalid nccl reduction. Must be ncclMin | ncclMax | " - "ncclProd | ncclSum")); + "Invalid nccl reduction. Must be mcclMin | mcclMax | " + "mcclProd | mcclSum")); return it->second; } template -class NCCLAllReduceKernel : public framework::OpKernel { +class mcclAllReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), @@ -74,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(x->data(), + platform::dynload::mcclAllReduce(x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, @@ -115,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclReduce(x->data(), + platform::dynload::mcclReduce(x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, @@ -144,7 +144,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, @@ -157,7 +157,7 @@ class NCCLBcastKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. 
recv buffer " << common::product(out->dims()); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclBcast(out->mutable_data(ctx.GetPlace()), + platform::dynload::mcclBcast(out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, @@ -173,8 +173,8 @@ class NCCLBcastKernel : public framework::OpKernel { namespace ops = paddle::operators; PD_REGISTER_STRUCT_KERNEL( - ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} + mcclAllReduce, GPU, ALL_LAYOUT, ops::mcclAllReduceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( - ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} + mcclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} PD_REGISTER_STRUCT_KERNEL( ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 6b0a36fc56472..8290da165800b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -30,13 +30,13 @@ #include "paddle/phi/kernels/funcs/tensor_to_string.h" #include "paddle/utils/optional.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) #include "cub/cub.cuh" #include "math.h" // NOLINT #endif @@ -74,6 +74,8 @@ static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); #endif @@ -271,6 +273,10 @@ static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream)); @@ -895,14 +901,14 @@ static void MultiTensorUpdateLambParamAndBetaPows( #undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) static bool CreatePreMulScaleOpIfSupported( - ncclDataType_t dtype, - ncclComm_t comm, + mcclDataType_t dtype, + mcclComm_t comm, const void *scale, - ncclRedOp_t *op, + mcclRedOp_t *op, distributed::NCCLCommContext *comm_ctx = nullptr) { -#if NCCL_VERSION_CODE >= 21100 +// #if NCCL_VERSION_CODE >= 21100 if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( comm_ctx, @@ -913,32 +919,32 @@ static bool CreatePreMulScaleOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); int ver = comm_ctx->GetNcclVersion(); if (ver >= 21100) { - VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + VLOG(10) << 
"mcclRedOpCreatePreMulSum is supported."; comm_ctx->RedOpCreatePreMulSum( - op, const_cast(scale), dtype, ncclScalarDevice); + op, const_cast(scale), dtype, mcclScalarDevice); return true; } } else { int ver; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetVersion(&ver)); if (ver >= 21100) { - VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( - op, const_cast(scale), dtype, ncclScalarDevice, comm)); + VLOG(10) << "mcclRedOpCreatePreMulSum is supported."; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( + op, const_cast(scale), dtype, mcclScalarDevice, comm)); return true; } } -#endif - VLOG(10) << "ncclRedOpCreatePreMulSum is not supported."; +// #endif + VLOG(10) << "mcclRedOpCreatePreMulSum is not supported."; return false; } static void DestoryOpIfSupported( - ncclRedOp_t op, - ncclComm_t comm, + mcclRedOp_t op, + mcclComm_t comm, distributed::NCCLCommContext *comm_ctx = nullptr) { -#if NCCL_VERSION_CODE >= 21100 - VLOG(10) << "ncclRedOpDestroy starts"; +// #if NCCL_VERSION_CODE >= 21100 + VLOG(10) << "mcclRedOpDestroy starts"; if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( @@ -950,12 +956,12 @@ static void DestoryOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); comm_ctx->RedOpDestroy(op); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, comm)); } - VLOG(10) << "ncclRedOpDestroy ends"; + VLOG(10) << "mcclRedOpDestroy ends"; -#endif - VLOG(10) << "ncclRedOpDestroy is not supported."; +// #endif + VLOG(10) << "mcclRedOpDestroy is not supported."; } template @@ -980,11 +986,11 @@ static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, } template -static void NCCLSumWithScaleBase(const T *sendbuff, +static void mcclSumWithScaleBase(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - ncclComm_t comm, + mcclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, @@ -1016,9 +1022,9 @@ static void NCCLSumWithScaleBase(const T *sendbuff, return; } - ncclRedOp_t op = ncclSum; - ncclDataType_t dtype = - std::is_same::value ? ncclFloat32 : ncclFloat16; + mcclRedOp_t op = mcclSum; + mcclDataType_t dtype = + std::is_same::value ? mcclFloat32 : mcclFloat16; bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported( dtype, comm, scale, &op, comm_ctx); memory_utils::Buffer buffer(dev_ctx.GetPlace()); @@ -1034,7 +1040,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclReduceScatter(sendbuff, + phi::dynload::mcclReduceScatter(sendbuff, recvbuff, recvcount, dtype, @@ -1045,7 +1051,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(sendbuff, + phi::dynload::mcclAllReduce(sendbuff, recvbuff, recvcount, dtype, @@ -1055,10 +1061,10 @@ static void NCCLSumWithScaleBase(const T *sendbuff, } } else { if (UseReduceScatter) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduceScatter( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } } @@ -1069,16 +1075,16 @@ static void NCCLSumWithScaleBase(const T *sendbuff, } template -static void NCCLReduceScatterWithScale(const T *sendbuff, +static void mcclReduceScatterWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - ncclComm_t comm, + mcclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase(sendbuff, + mcclSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1090,16 +1096,16 @@ static void NCCLReduceScatterWithScale(const T *sendbuff, } template -static void NCCLAllReduceWithScale(const T *sendbuff, +static void mcclAllReduceWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - ncclComm_t comm, + mcclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase(sendbuff, + mcclSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1240,6 +1246,10 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream)); @@ -1296,6 +1306,12 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) { sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, + out.Get(), + sizeof(flag), + musaMemcpyDeviceToHost, + dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), @@ -1458,7 +1474,7 @@ void DistributedFusedLambKernel( DenseTensor *acc_step, DenseTensor *stop_update, DenseTensor *step) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); found_inf->Resize({1}); @@ -1756,7 +1772,7 @@ void DistributedFusedLambKernel( // Step 6: allreduce + global norm gradient clip int64_t global_rank = 0, local_rank = 0; - ncclComm_t global_comm = nullptr, local_comm = nullptr, + mcclComm_t global_comm = nullptr, local_comm = nullptr, external_comm = nullptr; paddle::platform::NCCLComm *nccl_comm_handle = nullptr, *local_nccl_comm_handle = nullptr; @@ -1868,7 +1884,7 @@ void DistributedFusedLambKernel( // (1) ReduceScater first if (local_shard) { if (use_hierarchical_allreduce) { - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( 
fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1877,7 +1893,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1887,7 +1903,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1896,7 +1912,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1906,7 +1922,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - NCCLAllReduceWithScale(fp32_grad_data, + mcclAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -1914,7 +1930,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - NCCLAllReduceWithScale(fp16_grad_data, + mcclAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -1926,7 +1942,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - NCCLReduceScatterWithScale(fp32_grad_data, + mcclReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -1934,7 +1950,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - NCCLReduceScatterWithScale(fp16_grad_data, + mcclReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -1957,11 +1973,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(fp32_square_grad_norm, + phi::dynload::mcclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - ncclFloat32, - ncclSum, + mcclFloat32, + mcclSum, local_comm, stream)); } @@ -2014,7 +2030,7 @@ void DistributedFusedLambKernel( << HasNanInf(dev_ctx, fp16_grad_data, fp16_numel); if (local_shard) { if (use_hierarchical_allreduce) { - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2024,7 +2040,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp32_scale); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2033,7 +2049,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2043,7 +2059,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp16_scale); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2053,7 +2069,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - NCCLAllReduceWithScale(fp32_grad_data, + mcclAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2062,7 +2078,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - NCCLAllReduceWithScale(fp16_grad_data, + mcclAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2075,7 +2091,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - NCCLReduceScatterWithScale(fp32_grad_data, + mcclReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -2084,7 +2100,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - NCCLReduceScatterWithScale(fp16_grad_data, + mcclReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -2109,11 +2125,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(fp32_square_grad_norm, + phi::dynload::mcclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - ncclFloat32, - ncclSum, + mcclFloat32, + mcclSum, local_comm, stream)); VLOG(1) << "Grad square norm after all reduce: " @@ -2126,7 +2142,7 @@ void DistributedFusedLambKernel( } else { if (local_shard) { if (use_hierarchical_allreduce) { - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2135,7 +2151,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2144,7 +2160,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - NCCLReduceScatterWithScale( + mcclReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2153,7 +2169,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - NCCLAllReduceWithScale( + mcclAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2163,7 +2179,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - NCCLAllReduceWithScale(fp32_grad_data, + mcclAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2171,7 +2187,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - NCCLAllReduceWithScale(fp16_grad_data, + mcclAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2183,7 +2199,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - NCCLReduceScatterWithScale(fp32_grad_data, + mcclReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, num_devices, @@ -2191,7 +2207,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - NCCLReduceScatterWithScale(fp16_grad_data, + mcclReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, num_devices, @@ -2211,11 +2227,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(fp32_square_grad_norm, + phi::dynload::mcclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - ncclFloat32, - ncclSum, + mcclFloat32, + mcclSum, local_comm, stream)); } @@ -2357,26 +2373,26 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but param_square_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num, + phi::dynload::mcclAllReduce(param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, 2 * param_num - fp32_global_param_num, - ncclFloat32, - ncclSum, + mcclFloat32, + mcclSum, local_comm, stream)); } else { // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but trust_ratio_div_square_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(trust_ratio_div_square_norm, + phi::dynload::mcclAllReduce(trust_ratio_div_square_norm, trust_ratio_div_square_norm, param_num, - ncclFloat32, - ncclSum, + mcclFloat32, + mcclSum, local_comm, stream)); } - VLOG(10) << "ncclAllReduce done"; + VLOG(10) << "mcclAllReduce done"; } LogParamAndTrustRatioDivSquareNorm<1>( @@ -2401,7 +2417,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // ncclAllGather + // mcclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp32_param_out, fp32_offset, fp32_numel_each_device); @@ -2410,10 +2426,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, + phi::dynload::mcclAllGather(fp32_param_data + fp32_offset, fp32_param_data, fp32_numel_each_device, - ncclFloat32, + mcclFloat32, local_comm, stream)); } @@ -2439,7 +2455,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // ncclAllGather + // mcclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp16_param_out, fp16_offset, fp16_numel_each_device); @@ -2448,10 +2464,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, + phi::dynload::mcclAllGather(fp16_param_data + fp16_offset, fp16_param_data, fp16_numel_each_device, - ncclFloat16, + mcclFloat16, local_comm, stream)); } diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 4c47fd2b62178..13d925bbe19a1 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -25,7 +25,8 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/common/amp_type_traits.h" -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) + #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -461,7 +462,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 4f118565396e1..cc11601be0be6 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,7 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index ebdddfd41b33f..b9f05d663dba0 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b73ffe4319be7..24457c24a54ac 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,7 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; // NOLINT compute_stream_ = @@ -118,7 +118,7 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { @@ -197,6 +197,11 @@ void BufferedReader::ReadAsync(size_t i) { hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 032a74b7e23f1..db849dc70b5da 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,7 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e69492501c117..d0bde6af20489 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -757,7 +757,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
virtual std::string GetOpType() const = 0; }; -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) template class ReduceBaseOp, diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 30d4fb0cf9ad4..1a26271a97f22 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,7 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +462,7 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +492,7 @@ class ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -764,7 +764,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index f025d27807421..14b86627c3825 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -121,7 +121,7 @@ PD_REGISTER_KERNEL(save_sr, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..7e3de57345a4b 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,7 +39,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..13133e54f0415 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { const size_t *lod; size_t lod_count = 
x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +144,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 01f7bb3e92890..316f8a55cc803 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -26,44 +26,8 @@ template class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + PADDLE_ENFORCE(false,"not support"); - auto& lod = x->lod(); - auto& dims = x->dims(); - - const size_t level = lod.size() - 1; - PADDLE_ENFORCE_EQ( - dims[0], - static_cast(lod[level].back()), - platform::errors::InvalidArgument( - "The first dimension of Input(X) should be equal to the sum of all " - "sequences' lengths. But received first dimension of Input(X) is " - "%d, the sum of all sequences' lengths is %d.", - dims[0], - static_cast(lod[level].back()))); - PADDLE_ENFORCE_EQ(dims[0], - x->numel(), - platform::errors::InvalidArgument( - "The width of each timestep in Input(X) of " - "SequenceSoftmaxOp should be 1.")); - - out->mutable_data(ctx.GetPlace()); - for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - Tensor x_i = x->Slice(start_pos, end_pos); - Tensor out_i = out->Slice(start_pos, end_pos); - - // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) - framework::DDim dims_i = - // common::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); - common::make_ddim({1UL, end_pos - start_pos}); - x_i.Resize(dims_i); - out_i.Resize(dims_i); - phi::funcs::SoftmaxCUDNNFunctor()( - ctx.template device_context(), &x_i, &out_i); - } } }; @@ -71,36 +35,7 @@ template class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - if (x_grad) { - x_grad->set_lod(x->lod()); - } - auto& lod = x->lod(); - const size_t level = lod.size() - 1; - - x_grad->mutable_data(ctx.GetPlace()); // NOLINT - for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - - Tensor out_i = out->Slice(start_pos, end_pos); - Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); - Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); - - // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) - framework::DDim dims_i = common::make_ddim({1UL, end_pos - start_pos}); - out_i.Resize(dims_i); - out_grad_i.Resize(dims_i); - x_grad_i.Resize(dims_i); - phi::funcs::SoftmaxGradCUDNNFunctor()( - ctx.template device_context(), - 
&out_i, - &out_grad_i, - &x_grad_i); - } + PADDLE_ENFORCE(false,"not support"); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 12d4f72a91169..a037d0dcf73cc 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 40a7a451a6e21..3262bef2bf5e9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -17,6 +17,10 @@ limitations under the License. */ #include #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index a0aa1f589191f..16864b80b5c76 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -151,26 +151,32 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - op->SetType("set_value_grad"); - op->SetInput("ValueTensor", this->Input("ValueTensor")); - op->SetOutput(framework::GradVarName("ValueTensor"), - this->InputGrad("ValueTensor")); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - - if (this->HasInput("StartsTensorList")) { - op->SetInput("StartsTensorList", this->Input("StartsTensorList")); - } - if (this->HasInput("EndsTensorList")) { - op->SetInput("EndsTensorList", this->Input("EndsTensorList")); + if (this->HasInput("ValueTensor")) { + op->SetType("set_value_grad"); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("ValueTensor", this->Input("ValueTensor")); + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); + } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); + + } else { + op->SetType("assign"); + op->SetInput("X", this->OutputGrad("Out")); + op->SetOutput("Out", this->InputGrad("Input")); } - if (this->HasInput("StepsTensorList")) { - op->SetInput("StepsTensorList", this->Input("StepsTensorList")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); } }; diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 6b79d5c35b783..a1e4a328cf439 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index caa31565d4cf3..c2911806996ce 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(common::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index af69594f992cd..21406abff8d9f 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/sync_batch_norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/sync_batch_norm_kernel.h" @@ -104,8 +105,8 @@ void SyncBatchNormKernel(const Context& ctx, <<>>(x_d, N, H * W * D, C, stats); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + mcclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); if (comm == nullptr) { comm = ctx.nccl_comm(); } @@ -114,11 +115,11 @@ void SyncBatchNormKernel(const Context& ctx, int dtype = phi::ToNCCLDataType(mean_out->dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(stats, + phi::dynload::mcclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - ncclSum, + static_cast(dtype), + mcclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; @@ -236,26 +237,28 @@ void SyncBatchNormCooKernel(const Context& dev_ctx, DenseTensor* saved_mean, DenseTensor* saved_variance, DenseTensor* reserve_space) { - EmptyLikeCooKernel(dev_ctx, x, y); - phi::SyncBatchNormKernel(dev_ctx, - x.values(), - mean, - variance, - scale, - bias, - is_test, - momentum, - epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y->mutable_values(), - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - y->SetIndicesDict(x.GetIndicesDict()); + PADDLE_ENFORCE(false, "error"); + + // EmptyLikeCooKernel(dev_ctx, x, y); + // phi::SyncBatchNormKernel(dev_ctx, + // x.values(), + // mean, + // variance, + // scale, + // bias, + // is_test, + // momentum, + // epsilon, + // data_layout, + // use_global_stats, + // trainable_statistics, + // y->mutable_values(), + // mean_out, + // variance_out, + // saved_mean, + // saved_variance, + // reserve_space); + // y->SetIndicesDict(x.GetIndicesDict()); } template @@ -277,26 +280,27 @@ void SyncBatchNormCooGradKernel( SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { - EmptyLikeCooKernel(dev_ctx, x, x_grad); - *scale_grad = phi::EmptyLike(dev_ctx, scale); - *bias_grad = phi::EmptyLike(dev_ctx, bias); - 
phi::SyncBatchNormGradKernel(dev_ctx, - x.values(), - scale, - bias, - saved_mean, - saved_variance, - reserve_space, - y_grad.values(), - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - x_grad->mutable_values(), - scale_grad, - bias_grad); + PADDLE_ENFORCE(false, "error"); + // EmptyLikeCooKernel(dev_ctx, x, x_grad); + // *scale_grad = phi::EmptyLike(dev_ctx, scale); + // *bias_grad = phi::EmptyLike(dev_ctx, bias); + // phi::SyncBatchNormGradKernel(dev_ctx, + // x.values(), + // scale, + // bias, + // saved_mean, + // saved_variance, + // reserve_space, + // y_grad.values(), + // momentum, + // epsilon, + // data_layout, + // is_test, + // use_global_stats, + // trainable_statistics, + // x_grad->mutable_values(), + // scale_grad, + // bias_grad); } } // namespace sparse diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index c132a91bb5346..21f1052e03a28 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -19,7 +19,8 @@ limitations under the License. */ #include #include #include -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) + #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -27,7 +28,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/distributed/collective/process_group.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #include "paddle/common/layout.h" @@ -570,9 +571,9 @@ void SyncBatchNormGradFunctor( } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) int global_gid = 0; - ncclComm_t comm = nullptr; + mcclComm_t comm = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( global_gid)) { @@ -588,11 +589,11 @@ void SyncBatchNormGradFunctor( int dtype = paddle::platform::ToNCCLDataType(scale.dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(stats, + phi::dynload::mcclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - ncclSum, + static_cast(dtype), + mcclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index ef6172b6965f2..63d8614f3c369 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -15,7 +15,8 @@ limitations under the License. 
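In the sync_batch_norm forward and backward paths above, the communication itself is unchanged; only the NCCL symbols are swapped for their MCCL counterparts. A hedged sketch of the reduction both paths issue (illustrative; stats, C, dtype, comm and stream stand for the kernel's locals, and the buffer-layout comment is my reading of the 2 * C + 1 element count):

// Sketch only: in-place sum-reduction of the packed per-channel statistics.
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce(
    stats, stats,                        // in place: send and recv buffers alias
    2 * C + 1,                           // presumably C sums, C squared sums, 1 count
    static_cast<mcclDataType_t>(dtype),  // dtype obtained via ToNCCLDataType
    mcclSum, comm, stream));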
*/ #pragma once #include #include -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__MUSACC__) + #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 458794223dc74..20fe009e4c091 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 113ba40ec0cf3..1aaafb99cf969 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,8 +90,14 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce common) endif() +if(WITH_MUSA) + musa_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce common) +endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -138,7 +144,7 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce common) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) target_link_libraries(device_context gpu_resource_pool) endif() @@ -236,6 +242,31 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() +if(WITH_MUSA) + musa_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + if(WITH_CUSTOM_DEVICE) + musa_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu device_event_custom_device) + else() + musa_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + endif() + musa_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) +endif() + cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -285,6 +316,18 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) +elseif(WITH_MUSA) + musa_library( + profiler + SRCS profiler.cc profiler.cu + DEPS phi + common + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) elseif(WITH_XPU) cc_library( profiler @@ -365,8 +408,23 @@ if(WITH_ROCM) DEPS gpu_info) endif() +if(WITH_MUSA) + musa_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + musa_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info phi common) + musa_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() + if(NOT APPLE AND NOT WIN32) - if(WITH_GPU OR WITH_ROCM) + if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) cc_test( device_code_test SRCS device_code_test.cc diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 4ffcf53b1a574..0c32207501898 100644 --- 
a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -23,7 +23,7 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) class NCCLCommImpl : public NCCLComm { public: void set_ring_id(int ring_id) { ring_id_ = ring_id; } @@ -37,8 +37,8 @@ class NCCLCommImpl : public NCCLComm { int device_id() const override { return dev_ctx_->GetPlace().device; } - void set_comm(ncclComm_t comm) { comm_ = comm; } - ncclComm_t comm() const override { return comm_; } + void set_comm(mcclComm_t comm) { comm_ = comm; } + mcclComm_t comm() const override { return comm_; } gpuStream_t stream() const override { return dev_ctx_->stream(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int ring_id_; int nranks_; int rank_; - ncclComm_t comm_; + mcclComm_t comm_; std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream @@ -80,7 +80,7 @@ NCCLCommContext& NCCLCommContext::Instance() { } NCCLComm* NCCLCommContext::CreateComm( - ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { + mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( "The nccl unique id should not be null.")); @@ -106,10 +106,10 @@ NCCLComm* NCCLCommContext::CreateComm( platform::errors::InvalidArgument( "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); - ncclComm_t comm = nullptr; + mcclComm_t comm = nullptr; SetDeviceId(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); + platform::dynload::mcclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -133,8 +133,8 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, dev_ids.size())); const int kDevices = dev_ids.size(); - ncclComm_t comms[kDevices]; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( + mcclComm_t comms[kDevices]; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -156,7 +156,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, void NCCLCommContext::CreateNCCLCommMultiTrainer( const std::vector& dev_ids, - ncclUniqueId* nccl_id, + mcclUniqueId* nccl_id, int ntrainers, int train_id, int ring_id) { @@ -169,20 +169,22 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( VLOG(1) << "Begin CreateNCCLCommMultiTrainer. 
device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; - ncclComm_t comms[kDevices]; + mcclComm_t comms[kDevices]; { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif - platform::dynload::ncclCommInitRank( + platform::dynload::mcclCommInitRank( comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -208,7 +210,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( } NCCLComm* NCCLCommContext::AssignNCCLComm( - ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { + mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 6636856a0eb6c..d88e6e69fba50 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -28,10 +28,10 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) // In order to apply hierarchical communication with NCCL, we need // a communication ring contains NCCL communicators associated to a global -// ncclUniqueId. E.g. for a hierarchical case, +// mcclUniqueId. E.g. 
for a hierarchical case, // // 11 - 12 21 - 22 // | | | | @@ -55,7 +55,7 @@ class NCCLComm { virtual int nranks() const = 0; virtual int rank() const = 0; virtual int device_id() const = 0; - virtual ncclComm_t comm() const = 0; + virtual mcclComm_t comm() const = 0; virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; @@ -69,12 +69,12 @@ class NCCLCommContext { static NCCLCommContext& Instance(); NCCLComm* CreateComm( - ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); + mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); void CreateNCCLCommMultiTrainer(const std::vector& dev_ids, - ncclUniqueId* nccl_id, + mcclUniqueId* nccl_id, int nranks, int rank, int ring_id); @@ -82,7 +82,7 @@ class NCCLCommContext { // a latter comm with the same dev_id and the same ring_id // will override the former NCCLComm* AssignNCCLComm( - ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); + mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); // retrieve a communicator by the ring id in multiprocessing mode NCCLComm* Get(int ring_id) const { @@ -99,7 +99,7 @@ class NCCLCommContext { return comm_map_.at(ring_id).begin()->second.get(); } - int GetRingId(ncclComm_t comm) const { + int GetRingId(mcclComm_t comm) const { for (const auto& pair : comm_map_) { for (const auto& p : pair.second) { if (p.second.get()->comm() == comm) { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 6f0d86f0a4b17..b782a45047117 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,7 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index aa2dba03c9082..bcfb316837a30 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 65c3fb2063167..3176d042b7146 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -28,6 +28,18 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +elseif(WITH_MUSA) + # add_subdirectory(musa) + musa_library( + gpu_info + SRCS gpu_info.cc + DEPS phi common glog enforce monitor dynload_cuda) + + musa_test(cuda_helper_test SRCS cuda_helper_test.cu) + musa_test( + cudnn_desc_test + SRCS cudnn_desc_test.cc + DEPS dynload_cuda) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 878a122a49224..f94f5d55b7eee 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,10 +13,12 @@ // limitations under the License. 
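The collective_helper interface above keeps its NCCL-flavoured names but retypes every handle to the MCCL equivalents (mcclComm_t, mcclUniqueId). A usage sketch of the retyped API, illustrative only, with made-up rank and device values and no error handling:

// Sketch only: creating and fetching a communicator through the retyped API.
#include "paddle/fluid/platform/collective_helper.h"

void InitOneRing(mcclUniqueId* nccl_id) {  // unique id produced by the root rank
  auto& ctx = paddle::platform::NCCLCommContext::Instance();
  paddle::platform::NCCLComm* comm =
      ctx.CreateComm(nccl_id, /*nranks=*/2, /*rank=*/0, /*dev_id=*/0, /*ring_id=*/0);
  mcclComm_t raw = comm->comm();           // raw MCCL handle for collective calls
  (void)raw;
}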
#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/fluid/platform/device/gpu/musa/musa_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 3a26b73e64b77..f82d836e83e77 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/fluid/platform/dynload/mudnn.h" #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -215,6 +217,12 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } +#elif defined(PADDLE_WITH_MUSA) + if (UNLIKELY(malloc_managed_memory)) { + result = musaMallocManaged(ptr, size); + } else { + result = musaMalloc(ptr, size); + } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -260,6 +268,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { +#elif defined(PADDLE_WITH_MUSA) + auto err = musaFree(ptr); + if (err != musaErrorMusartUnloading) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -306,6 +317,8 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); +#elif defined(PADDLE_WITH_MUSA) + auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index b5a00e9257a80..a2fe54ae4dca4 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 98c6e379342f2..018fee5f7416f 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,10 +16,12 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 9f2168e1cdb8b..0fb7e061e3243 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
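The gpu_info.cc hunks above add the MUSA arm of the recorded allocator: allocation goes through musaMallocManaged/musaMalloc, deallocation through musaFree with musaErrorMusartUnloading tolerated at teardown, and capacity queries through musaMemGetInfo, mirroring the CUDA and HIP arms. A self-contained sketch of that allocate/free shape (illustrative; the <musa_runtime.h> header name is an assumption, and the helper function is hypothetical):

// Sketch only: the MUSA malloc/free pattern used by RecordedGpuMallocHelper.
#include <musa_runtime.h>

bool AllocAndFreeOneMiB() {
  void* ptr = nullptr;
  if (musaMalloc(&ptr, 1 << 20) != musaSuccess) {  // or musaMallocManaged(...)
    return false;
  }
  musaError_t err = musaFree(ptr);
  // musaErrorMusartUnloading means the runtime is already tearing down (e.g. at
  // process exit); the helper above treats it as benign, like the CUDA/HIP cases.
  return err == musaSuccess || err == musaErrorMusartUnloading;
}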
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -30,6 +30,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -41,6 +44,8 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -82,6 +87,9 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -93,6 +101,8 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 2ac13e692f783..17e649b9ac62a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,13 +14,16 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #include #endif - +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..df8b87ed3a036 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,14 +15,19 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/dynload/mudnn.h" +#include "paddle/phi/backends/gpu/forwards.h" #else #include @@ -34,78 +39,95 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // CDUA - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define 
DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); - -DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); -DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, - cudnnActivationStruct, - miopenActivationDescriptor); -DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, - cudnnActivationMode_t, - miopenActivationMode_t); -DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, - cudnnTensorStruct, - miopenTensorDescriptor); -DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, - cudnnTensorFormat_t, - miopenTensorFormat_t); -DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, - cudnnFilterStruct, - miopenTensorDescriptor); -DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, - cudnnFilterDescriptor_t, - miopenTensorDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, - cudnnConvolutionStruct, - miopenConvolutionDescriptor); -DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, - cudnnConvolutionDescriptor_t, - miopenConvolutionDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, - cudnnPoolingDescriptor_t, - miopenPoolingDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); -DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, - cudnnDropoutDescriptor_t, - miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); + + + // DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); + // DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + // cudnnActivationStruct, + // miopenActivationDescriptor, + // mudnnActivationStruct); + // DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + // cudnnActivationMode_t, + // miopenActivationMode_t, + // mudnnActivationMode_t); + // DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + // cudnnTensorStruct, + // miopenTensorDescriptor, + // mudnnTensorStruct); + // DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + // cudnnTensorFormat_t, + // miopenTensorFormat_t, + // mudnnTensorFormat_t); + // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + // cudnnFilterStruct, + // miopenTensorDescriptor, + // mudnnFilterStruct); + // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + // cudnnFilterDescriptor_t, + // miopenTensorDescriptor_t, + // mudnnFilterDescriptor_t); + // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + // cudnnConvolutionStruct, + // miopenConvolutionDescriptor, + // mudnnConvolutionStruct); + // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + // cudnnConvolutionDescriptor_t, + // miopenConvolutionDescriptor_t, + // mudnnConvolutionDescriptor_t); + // DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + // cudnnPoolingDescriptor_t, + // miopenPoolingDescriptor_t, + // mudnnPoolingDescriptor_t); + // DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, 
miopenPoolingMode_t,mudnnPoolingMode_t);MUDNN_DNN_ROUTINE_EACH + // DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + // cudnnDropoutDescriptor_t, + // miopenDropoutDescriptor_t, + // mudnnDropoutDescriptor_t); + DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/musa/musa_helper.h b/paddle/fluid/platform/device/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..45ded21129a5a --- /dev/null +++ b/paddle/fluid/platform/device/gpu/musa/musa_helper.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. 
+ * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. + * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * + */ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +class CublasHandleHolder { + public: + explicit CublasHandleHolder(musaStream_t stream) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasCreate(&handle_)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasSetStream(handle_, stream)); + } + + const mublasHandle_t& GetCublasHandle() const { return handle_; } + + ~CublasHandleHolder() PADDLE_MAY_THROW { + PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasDestroy(handle_)); + } + + template + inline void Call(Callback&& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + mublasHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 8afcfc9f2b700..db5bcbc08c5de 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MUSA) #include #include @@ -29,9 +29,15 @@ #ifdef PADDLE_WITH_NCCL #include "paddle/fluid/platform/dynload/nccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/bfloat16.h" @@ -44,63 +50,63 @@ namespace paddle { namespace platform { -inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { +inline mcclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP32) { - return ncclFloat; + return mcclFloat; } else if (type == framework::proto::VarType::FP64) { - return ncclDouble; + return mcclFloat; } else if (type == framework::proto::VarType::INT32) { - return ncclInt; + return mcclInt; } else if (type == framework::proto::VarType::INT64) { - return ncclInt64; + return mcclInt64; } else if (type == framework::proto::VarType::FP16) { - return ncclFloat16; + return mcclFloat16; } else if (type == framework::proto::VarType::INT8) { - return ncclInt8; + return mcclInt8; } else if (type == framework::proto::VarType::UINT8) { - return ncclUint8; + return mcclUint8; } else if 
(type == framework::proto::VarType::BOOL) { - return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - } else if (type == framework::proto::VarType::BF16) { - return ncclBfloat16; -#endif + return mcclUint8; +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + // } else if (type == framework::proto::VarType::BF16) { + // return mcclBfloat16; +// #endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -inline ncclDataType_t ToNCCLDataType(phi::DataType type) { +inline mcclDataType_t ToNCCLDataType(phi::DataType type) { if (type == phi::DataType::FLOAT32) { - return ncclFloat; + return mcclFloat; } else if (type == phi::DataType::FLOAT64) { - return ncclDouble; + return mcclFloat; } else if (type == phi::DataType::INT32) { - return ncclInt; + return mcclInt; } else if (type == phi::DataType::INT64) { - return ncclInt64; + return mcclInt64; } else if (type == phi::DataType::FLOAT16) { - return ncclFloat16; + return mcclFloat16; } else if (type == phi::DataType::UINT8) { - return ncclUint8; + return mcclUint8; } else if (type == phi::DataType::INT8) { - return ncclInt8; + return mcclInt8; } else if (type == phi::DataType::BOOL) { - return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - } else if (type == phi::DataType::BFLOAT16) { - return ncclBfloat16; -#endif + return mcclUint8; +// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + // } else if (type == phi::DataType::BFLOAT16) { + // return mcclBfloat16; +// #endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// NOTE(minqiyang): according to the mcclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, -// ncclGroupEnd will wait for all communicators to be initialized, which will +// mcclGroupEnd will wait for all communicators to be initialized, which will // cause blocking problem when a runtime_error was thrown, so try only guard // NCCL actions when use it. 
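A usage sketch for the RAII guard declared just below (illustrative only; num_devices is a hypothetical parameter, and the commented call is the same mcclCommInitRank pattern used by CreateNCCLCommMultiTrainer earlier in this patch):

// Sketch only: MCCL calls issued while the guard is alive are batched and
// committed together when it goes out of scope.
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"

void InitCommsInOneGroup(int num_devices) {
  paddle::platform::NCCLGroupGuard guard;  // locks NCCLMutex(), calls mcclGroupStart()
  for (int i = 0; i < num_devices; ++i) {
    // per-device communicator setup goes here, e.g.
    //   platform::dynload::mcclCommInitRank(&comms[i], nranks, *nccl_id, rank + i);
  }
}  // ~NCCLGroupGuard() calls mcclGroupEnd() and unlocks the mutex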
class NCCLGroupGuard { @@ -112,18 +118,18 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); NCCLMutex().unlock(); } }; struct NCCLContext { std::unique_ptr ctx_; - ncclComm_t comm_; + mcclComm_t comm_; explicit NCCLContext(int dev_id) : comm_{nullptr} { ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id))); @@ -150,7 +156,7 @@ struct NCCLContext { } gpuStream_t stream() const { return ctx_->stream(); } - ncclComm_t comm() const { return comm_; } + mcclComm_t comm() const { return comm_; } int device_id() const { return ctx_->GetPlace().device; } }; @@ -160,7 +166,7 @@ struct NCCLContextMap { std::vector order_; explicit NCCLContextMap(const std::vector &places, - ncclUniqueId *nccl_id = nullptr, + mcclUniqueId *nccl_id = nullptr, size_t num_trainers = 1, size_t trainer_id = 0) { PADDLE_ENFORCE_EQ(!places.empty(), @@ -179,11 +185,11 @@ struct NCCLContextMap { platform::errors::Unavailable("NCCL Context Map does not support " "contain two or more same device.")); - std::unique_ptr comms(new ncclComm_t[order_.size()]); + std::unique_ptr comms(new mcclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { PADDLE_ENFORCE_NOT_NULL( @@ -203,7 +209,7 @@ struct NCCLContextMap { VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; SetDeviceId(gpu_id); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); } } @@ -298,7 +304,7 @@ class NCCLCommunicator { } void InitFlatCtxs(const std::vector &places, - const std::vector &nccl_ids, + const std::vector &nccl_ids, size_t trainers_num, size_t trainer_id) { if (nccl_ids.size() == 0) { @@ -330,8 +336,8 @@ class NCCLCommunicator { } void InitHierarchicalCtxs(const std::vector &places, - const std::vector &inter_nccl_ids, - const std::vector &exter_nccl_ids, + const std::vector &inter_nccl_ids, + const std::vector &exter_nccl_ids, size_t trainers_num, size_t trainer_id, size_t inter_trainers_num, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c4f40767fd52c..786b38239e60e 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +53,7 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +86,7 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -184,7 +184,7 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, @@ -221,7 +221,7 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 4a75d3ea97f9a..b015bb9a3e625 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -53,6 +53,18 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif + +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT +#include "paddle/fluid/platform/dynload/mudnn.h" +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/phi/backends/gpu/gpu_context.h" // NOLINT +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -136,7 +148,7 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -165,7 +177,7 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 402974b89e5c9..cb43f00f7fe0f 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,7 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index cd2d31f1fbefb..c23f395e0e36b 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -53,6 +53,14 @@ unsigned int GenerateDeviceEventFlag(bool enable_timing, return flags; #endif +#ifdef PADDLE_WITH_MUSA + unsigned int flags = + (blocking ? musaEventBlockingSync : musaEventDefault) | + (enable_timing ? musaEventDefault : musaEventDisableTiming) | + (interprocess ? musaEventInterprocess : musaEventDefault); + return flags; +#endif + return 0; } diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index d64b062cda0ac..bbeb67821e023 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 29f7b91a17157..10f582069e661 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -22,6 +22,10 @@ endif() if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() +if(WITH_MUSA) + list(APPEND MUSA_SRCS mublas.cc mudnn.cc murand.cc mufft.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
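device_event_base.cc above adds the MUSA arm of GenerateDeviceEventFlag, composing event-creation flags the same way as the CUDA and HIP arms. An illustrative restatement of that composition (MusaEventFlags is a hypothetical stand-in name, and the <musa_runtime.h> include is an assumption):

// Sketch only: the flag composition performed by the MUSA branch above.
#include <musa_runtime.h>

unsigned int MusaEventFlags(bool enable_timing, bool blocking, bool interprocess) {
  return (blocking ? musaEventBlockingSync : musaEventDefault) |
         (enable_timing ? musaEventDefault : musaEventDisableTiming) |
         (interprocess ? musaEventInterprocess : musaEventDefault);
}
// e.g. MusaEventFlags(false, true, false) yields
// musaEventBlockingSync | musaEventDisableTiming, assuming musaEventDefault is 0
// as with cudaEventDefault.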
@@ -39,6 +43,15 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) + if(WITH_MCCL) + list(APPEND MUSA_SRCS mccl.cc) + endif() + if(CUPTI_FOUND) + list(APPEND MUSA_SRCS mupti.cc) + endif() + endif() endif() if(TENSORRT_FOUND) @@ -62,6 +75,15 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi common) +elseif(WITH_MUSA) + musa_library( + dynload_cuda + SRCS ${MUSA_SRCS} + DEPS dynamic_loader phi common) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi common) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 93a19645a0a34..aebdd715b9e1c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -47,6 +47,7 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); +void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/mccl.cc b/paddle/fluid/platform/dynload/mccl.cc new file mode 100644 index 0000000000000..8497d35e2484d --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +// #if NCCL_VERSION_CODE >= 2212 +MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +// #endif + +// #if NCCL_VERSION_CODE >= 2304 +MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) +// #endif + +// #if NCCL_VERSION_CODE >= 2703 +MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +// #endif + +// #if NCCL_VERSION_CODE >= 21100 +MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) +// #endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mccl.h b/paddle/fluid/platform/dynload/mccl.h new file mode 100644 index 0000000000000..0e1eac41691a5 --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); +MCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +MCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(mcclSend); \ + __macro(mcclRecv); +MCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + + +#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); +MCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.cc b/paddle/fluid/platform/dynload/mublas.cc new file mode 100644 index 0000000000000..0ca4c6c3dac99 --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 +MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 +MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 +MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h new file mode 100644 index 0000000000000..0b7d21a4ecb76 --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + + +MUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + + +MUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + + +MUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + + +MUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.cc b/paddle/fluid/platform/dynload/mudnn.cc new file mode 100644 index 0000000000000..8b6ee172e1455 --- /dev/null +++ b/paddle/fluid/platform/dynload/mudnn.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mudnn.h" + +#include "paddle/phi/backends/dynload/mudnn.h" +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +namespace paddle { +namespace platform { +namespace dynload { + +// MUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDNN() { return phi::dynload::HasCUDNN(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.h b/paddle/fluid/platform/dynload/mudnn.h new file mode 100644 index 0000000000000..f980972538a0e --- /dev/null +++ b/paddle/fluid/platform/dynload/mudnn.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
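The mublas dynload pair above follows Paddle's usual dynamic-loading pattern: the header re-exports a per-routine DynLoad__<name> functor from phi::dynload, and the .cc file instantiates one global object per routine through DEFINE_WRAP. An expansion written out by hand for a single routine (illustrative; it assumes mublasCreate is among the routines listed in MUBLAS_BLAS_ROUTINE_EACH, which its use in musa_helper.h earlier in this patch suggests):

// In mublas.h, PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(mublasCreate) expands to:
using DynLoad__mublasCreate = phi::dynload::DynLoad__mublasCreate;
extern DynLoad__mublasCreate mublasCreate;

// In mublas.cc, DEFINE_WRAP(mublasCreate) produces the matching definition:
DynLoad__mublasCreate mublasCreate;

// Call sites then invoke it like the real entry point, as CublasHandleHolder does:
//   PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasCreate(&handle_));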
*/ + +#pragma once +#ifdef PADDLE_WITH_MUSA +#include +#include +#include "paddle/phi/backends/dynload/mudnn.h" + +namespace paddle { +namespace platform { +namespace dynload { + +using ::musa::dnn::BatchNorm; +using ::musa::dnn::Convolution; +using ::musa::dnn::Handle; +using ::musa::dnn::MemoryHandler; +using ::musa::dnn::Pooling; +using ::musa::dnn::Softmax; +using ::musa::dnn::Tensor; + +extern bool HasCUDNN(); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/mufft.cc b/paddle/fluid/platform/dynload/mufft.cc new file mode 100644 index 0000000000000..1126ab516619c --- /dev/null +++ b/paddle/fluid/platform/dynload/mufft.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mufft.h" + +#include "paddle/phi/backends/dynload/mufft.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mufft.h b/paddle/fluid/platform/dynload/mufft.h new file mode 100644 index 0000000000000..31452acd9d817 --- /dev/null +++ b/paddle/fluid/platform/dynload/mufft.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
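HasCUDNN() above forwards to phi::dynload::HasCUDNN(), as mudnn.cc shows, so MUSA builds can probe muDNN availability before creating any ::musa::dnn handles. A guarded use might look like the sketch below; the fallback behaviour described in the comment is an assumption, only the wrapper itself comes from this header.

#ifdef PADDLE_WITH_MUSA
#include "paddle/fluid/platform/dynload/mudnn.h"

bool CanUseMudnn() {
  // Presumably returns false when the muDNN shared library cannot be
  // loaded, letting callers fall back to non-muDNN kernels.
  return paddle::platform::dynload::HasCUDNN();
}
#endif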
*/ + +#pragma once +#ifdef PADDLE_WITH_MUSA +#include +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/mufft.h" + +namespace paddle { +namespace platform { +namespace dynload { + + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed cufft functions in HPPL + * different cufft version has different interfaces + **/ +#define MUFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(mufftPlan1d); \ + __macro(mufftPlan2d); \ + __macro(mufftPlan3d); \ + __macro(mufftPlanMany); \ + __macro(mufftMakePlan1d); \ + __macro(mufftMakePlan2d); \ + __macro(mufftMakePlan3d); \ + __macro(mufftMakePlanMany); \ + __macro(mufftEstimate1d); \ + __macro(mufftEstimate2d); \ + __macro(mufftEstimate3d); \ + __macro(mufftEstimateMany); \ + __macro(mufftCreate); \ + __macro(mufftGetSize1d); \ + __macro(mufftGetSize2d); \ + __macro(mufftGetSize3d); \ + __macro(mufftGetSizeMany); \ + __macro(mufftGetSize); \ + __macro(mufftSetWorkArea); \ + __macro(mufftSetAutoAllocation); \ + __macro(mufftExecC2C); \ + __macro(mufftExecR2C); \ + __macro(mufftExecC2R); \ + __macro(mufftExecZ2Z); \ + __macro(mufftExecD2Z); \ + __macro(mufftExecZ2D); \ + __macro(mufftSetStream); \ + __macro(mufftDestroy); \ + __macro(mufftGetVersion); \ + __macro(mufftGetProperty); \ + __macro(mufftXtSetGPUs); \ + __macro(mufftXtMalloc); \ + __macro(mufftXtMemcpy); \ + __macro(mufftXtFree); \ + __macro(mufftXtExecDescriptorC2C); \ + __macro(mufftXtExecDescriptorR2C); \ + __macro(mufftXtExecDescriptorC2R); \ + __macro(mufftXtExecDescriptorZ2Z); \ + __macro(mufftXtExecDescriptorD2Z); \ + __macro(mufftXtExecDescriptorZ2D); \ + __macro(mufftXtQueryPlan); \ + __macro(mufftXtSetCallback); \ + __macro(mufftXtClearCallback); \ + __macro(mufftXtMakePlanMany); \ + __macro(mufftXtGetSizeMany); \ + __macro(mufftXtExec); \ + __macro(mufftXtExecDescriptor); + +MUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/murand.cc b/paddle/fluid/platform/dynload/murand.cc new file mode 100644 index 0000000000000..82b911ead3271 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.h b/paddle/fluid/platform/dynload/murand.h new file mode 100644 index 0000000000000..b20a49a704384 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc new file mode 100644 index 0000000000000..8898bd4dfb654 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musa_driver.h" + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { return phi::dynload::HasCUDADriver(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h new file mode 100644 index 0000000000000..261841e8e7384 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
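The muRAND wrappers declared above are called like the original C API. A hypothetical host-side sequence is sketched below; the generator handle type, the enum value, and the argument order are assumptions borrowed from the cuRAND equivalents, only the routine names come from MURAND_RAND_ROUTINE_EACH, and musaStream_t is assumed to be provided by the MUSA runtime header.

#include "paddle/fluid/platform/dynload/murand.h"

void FillUniform(float* gpu_buf, size_t n, unsigned long long seed,
                 musaStream_t stream) {
  namespace dyn = paddle::platform::dynload;
  murandGenerator_t gen;  // assumed handle type
  dyn::murandCreateGenerator(&gen, MURAND_RNG_PSEUDO_DEFAULT);  // assumed enum
  dyn::murandSetStream(gen, stream);
  dyn::murandSetPseudoRandomGeneratorSeed(gen, seed);
  dyn::murandGenerateUniform(gen, gpu_buf, n);  // n floats in [0, 1)
  dyn::murandDestroyGenerator(gen);
}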
*/ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasCUDADriver(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define PLATFORM_MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet) + +PLATFORM_MUSA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc new file mode 100644 index 0000000000000..4e15dab9c1359 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musartc.h" + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { return phi::dynload::HasNVRTC(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h new file mode 100644 index 0000000000000..fca957131ef4e --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
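The MUSA driver wrappers above can be used to probe the device setup before any context is created. The sketch below assumes the mu* entry points keep the CUDA driver API's argument types (muInit taking an unsigned int flag, int out-parameters for version and count); only the routine names and HasCUDADriver() are taken from this header.

#include <iostream>

#include "paddle/fluid/platform/dynload/musa_driver.h"

void ProbeMusaDriver() {
  namespace dyn = paddle::platform::dynload;
  if (!dyn::HasCUDADriver()) {  // false when the MUSA driver library is absent
    std::cout << "MUSA driver not available" << std::endl;
    return;
  }
  dyn::muInit(0);
  int version = 0;
  int device_count = 0;
  dyn::muDriverGetVersion(&version);
  dyn::muDeviceGetCount(&device_count);
  std::cout << "MUSA driver " << version << ", " << device_count
            << " device(s)" << std::endl;
}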
*/ + +#pragma once + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasNVRTC(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.cc b/paddle/fluid/platform/dynload/musparse.cc new file mode 100644 index 0000000000000..347059362bc8d --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#ifdef MUSPARSE_ROUTINE_EACH +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h new file mode 100644 index 0000000000000..586decb9c55c1 --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
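The mtrtc wrappers above appear to mirror NVRTC's runtime-compilation flow: create a program, compile it, then fetch the generated binary and the log. The sketch below assumes NVRTC-style signatures; mtrtcProgram and the argument lists are assumptions, only the routine names come from MUSARTC_ROUTINE_EACH.

#include <string>

#include "paddle/fluid/platform/dynload/musartc.h"

std::string CompileToMusaBinary(const char* kernel_src) {
  namespace dyn = paddle::platform::dynload;
  mtrtcProgram prog;  // assumed handle type
  dyn::mtrtcCreateProgram(&prog, kernel_src, "kernel.mu", 0, nullptr, nullptr);
  dyn::mtrtcCompileProgram(prog, 0, nullptr);  // no extra compile options
  size_t size = 0;
  dyn::mtrtcGetMUSASize(prog, &size);
  std::string binary(size, '\0');
  dyn::mtrtcGetMUSA(prog, &binary[0]);
  dyn::mtrtcDestroyProgram(&prog);
  return binary;
}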
*/ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) + + +MUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) +#endif // PADDLE_WITH_MUSA + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 7b0ea3bb7f3c1..2cf04248687f2 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -22,21 +22,21 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -#if NCCL_VERSION_CODE >= 2212 +// #if NCCL_VERSION_CODE >= 2212 NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2304 +// #if NCCL_VERSION_CODE >= 2304 NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2703 +// #if NCCL_VERSION_CODE >= 2703 NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 21100 +// #if NCCL_VERSION_CODE >= 21100 NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -#endif +// #endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d9516c9f4de4e..d2150204b8810 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -29,18 +29,18 @@ namespace dynload { #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(ncclGetUniqueId); \ + __macro(mcclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(ncclAllReduce); \ - __macro(ncclBcast); \ - __macro(ncclAllGather); \ - __macro(ncclGroupStart); \ - __macro(ncclGroupEnd); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -48,29 +48,29 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -#if NCCL_VERSION_CODE >= 2212 +// #if NCCL_VERSION_CODE >= 2212 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); +// #if NCCL_VERSION_CODE >= 2304 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2703 +// #if NCCL_VERSION_CODE >= 2703 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ __macro(ncclSend); \ __macro(ncclRecv); NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 21100 +// #if NCCL_VERSION_CODE >= 21100 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ __macro(ncclRedOpCreatePreMulSum); \ __macro(ncclRedOpDestroy); NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) 
-#endif +// #endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc index 62bb6a88af7c0..512a8fbafe6f6 100644 --- a/paddle/fluid/platform/dynload/rccl.cc +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -22,21 +22,21 @@ namespace dynload { RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -#if NCCL_VERSION_CODE >= 2212 +// #if NCCL_VERSION_CODE >= 2212 RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2304 +// #if NCCL_VERSION_CODE >= 2304 RCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 2703 +// #if NCCL_VERSION_CODE >= 2703 RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -#endif +// #endif -#if NCCL_VERSION_CODE >= 21100 +// #if NCCL_VERSION_CODE >= 21100 RCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -#endif +// #endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index 4d988e4fb08a0..cba083334ce5c 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -29,17 +29,17 @@ namespace dynload { #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(ncclGetUniqueId); \ + __macro(mcclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(ncclAllReduce); \ - __macro(ncclBcast); \ - __macro(ncclAllGather); \ - __macro(ncclGroupStart); \ - __macro(ncclGroupEnd); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclGetErrorString); @@ -52,7 +52,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 1a82b05f3bc3a..8dab0df500782 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,6 +38,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_CUDA + #ifdef PADDLE_WITH_HIP #include #include @@ -81,6 +91,20 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +// #include "paddle/phi/backends/dynload/musolver.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_MUSA + + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -98,7 +122,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 690580d8f9c5d..d3148257ea6de 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -391,7 +391,7 @@ TEST(enforce, hip_success) { EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error")); #endif @@ -498,7 +498,7 @@ TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e807a54fdee2d..68a7a2e462aa7 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,6 +21,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif + +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index a77e396adee5f..6bcf6a368331f 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -500,8 +500,8 @@ SocketServer& SocketServer::GetInstance(const std::string& end_point) { std::vector* nccl_ids, \ int ring_id = 0); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -INSTANT_TEMPLATE(ncclUniqueId) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +INSTANT_TEMPLATE(mcclUniqueId) #endif #ifdef PADDLE_WITH_XPU_BKCL INSTANT_TEMPLATE(BKCLUniqueId) diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index d97b41311995e..0d975d84093cf 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include #include diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index a3fff528f7903..c07772e1a1afc 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -57,8 +57,8 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/custom_kernel.h" -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -169,7 +169,7 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) try { // use user specified GPUs in single-node multi-process mode. devices = platform::GetSelectedDevices(); @@ -209,7 +209,7 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(device)); #endif #ifdef PADDLE_WITH_XPU @@ -220,7 +220,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -431,19 +431,19 @@ void InitMemoryMethod() { memory_method->allocation_deleter = paddle::memory::allocation::Allocator::AllocationDeleter; #if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \ - defined(PADDLE_WITH_HIP) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->copy_with_stream = paddle::memory::Copy; #endif memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) // TODO(GhostScreaming): Use phi methods later. 
memory_method->get_allocator = [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 3d215435881cf..b0bc0a111cdd2 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -58,7 +58,7 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -68,7 +68,7 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 44c17c32fa8d5..1ed73672f0e3e 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -685,7 +685,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 5d1caffd45326..84a20f8bf7d3c 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,6 +16,11 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -52,6 +57,20 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } +#elif defined(PADDLE_WITH_MUSA) + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + musaStream_t stream; + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); + Mark("_musa_startup_"); + int *ptr; + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); + }); + } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 4d6bc9cc242d4..89c78f01ac487 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -198,7 +198,7 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index de8fd01a1e59d..e67b0fbc3c68d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -552,7 +552,7 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information @@ -664,6 +664,44 @@ void ChromeTracingLogger::LogDeviceProperty( device_nums -= 1; } #endif +#if defined(PADDLE_WITH_MUSA) + for (auto it = device_property_map.begin(); it != device_property_map.end(); + it++) { + const gpuDeviceProp& device_property = it->second; + if (device_nums > 1) { + output_file_stream_ << string_format(std::string( + R"JSON( + { + "id": %u, "name": "%s", "totalGlobalMem": %llu, + "computeMajor": %d, "computeMinor": %d, + "smCount": %d + }, + )JSON"), + it->first, + device_property.name, + device_property.totalGlobalMem, + device_property.major, + device_property.minor, + device_property.multiProcessorCount); + } else { + output_file_stream_ << string_format(std::string( + R"JSON( + { + "id": %u, "name": "%s", "totalGlobalMem": %llu, + "computeMajor": %d, "computeMinor": %d, + "smCount": %d + }], + )JSON"), + it->first, + device_property.name, + device_property.totalGlobalMem, + device_property.major, + device_property.minor, + device_property.multiProcessorCount); + } + device_nums -= 1; + } +#endif } #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 37323d1450bf2..e0cf523ea53ee 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -41,7 +41,7 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 1fce7edc3e329..c2020acf35d25 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,7 @@ std::unique_ptr 
DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +155,7 @@ DeserializationReader::~DeserializationReader() { // NOLINT input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 5f99f6fd82c55..c8ac33c5bea49 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,7 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 6f4ed06de9e8e..9b5b2636db30b 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,7 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (const auto& item : device_property_map) { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..67eafdf44e3cd 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,7 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index c01b4abcfbbd3..4ea1b756a458c 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -130,7 +130,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ProfilerResult::ProfilerResult( 
std::unique_ptr tree, const ExtraInfo& extra_info, @@ -170,7 +170,7 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -178,7 +178,7 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index dae32a1902834..f1d217674bf6c 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,7 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +166,7 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +176,7 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index bcb35f5b7bd35..2bb7731b0c159 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,10 +18,16 @@ #ifdef PADDLE_WITH_CUDA #include #endif + +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -47,6 +53,9 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); +#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : dev_types) { @@ -162,7 +171,7 @@ std::unique_ptr Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto device_id : device_ids) { diff --git 
a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index f7f888d9e6739..86243e9258dd6 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -23,6 +23,9 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -80,6 +83,11 @@ TEST(ProfilerTest, TestCudaTracer) { hipStream_t stream; hipStreamCreate(&stream); hipStreamSynchronize(stream); +#endif +#ifdef PADDLE_WITH_MUSA + musaStream_t stream; + musaStreamCreate(&stream); + musaStreamSynchronize(stream); #endif auto profiler_result = profiler->Stop(); auto nodetree = profiler_result->GetNodeTrees(); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 9835e7525c51e..e1720874e1489 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -34,6 +34,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif @@ -103,6 +107,17 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif + +#ifdef PADDLE_WITH_MUSA + int pre_device_id = GetCurrentDeviceId(); + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + } + SetDeviceId(pre_device_id); +#endif + #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); @@ -141,7 +156,7 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -343,7 +358,7 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index c55bcb71a7d43..97ca34c0209d3 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,6 +24,11 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif +#ifdef PADDLE_WITH_MUSA +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void *user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void *user_data) @@ -58,6 +63,11 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -71,7 +81,7 @@ void StreamCallbackManager::AddCallback( template void 
StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) platform::GpuStreamSync(stream_); #endif { @@ -88,5 +98,8 @@ template struct StreamCallbackManager; #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif +#ifdef PADDLE_WITH_MUSA +template struct StreamCallbackManager; +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 7cd6930a9d0d0..1cc0f0e5cf1e9 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -25,6 +25,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #include #include // NOLINT #include diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 66f17168ec01a..a35095c98d4a2 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -22,9 +22,6 @@ namespace paddle { namespace primitive { namespace details { -// empty_shape means x.shape=[] -static std::vector empty_shape; - template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { auto org_dtype = x.dtype(); @@ -348,66 +345,62 @@ std::tuple layer_norm_decomp( // cast dtype to float32 if dtype =float16 or bfloat16 if (need_cast) { - x_cast = cast(x_cast, DataType::FLOAT32); + x_cast = cast(x_cast, phi::DataType::FLOAT32); } auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } - auto mean_ = mean_decomp(x_cast, axis, true); + auto mean_ = mean_decomp(x_cast, IntArray(axis), true); auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; - auto variance = mean_decomp(var_tmp1, axis, true); + auto variance = mean_decomp(var_tmp1, IntArray(axis), true); auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( - var_tmp3, full(empty_shape, -0.5, var_tmp3.dtype())); + var_tmp3, + full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); auto bias_ptr = bias.get_ptr(); - std::vector slice_shape_l; - std::vector slice_shape_r; - for (int64_t i = 0; i < static_cast(x_dim.size()); i++) { - if (i < begin_norm_axis) { - slice_shape_l.push_back(x_dim[i]); - } else { - slice_shape_r.push_back(x_dim[i]); - } + std::vector slice_shape; + for (int64_t i = begin_norm_axis; i < static_cast(x_dim.size()); + i++) { + slice_shape.push_back(x_dim[i]); } Tensor scale_cast; if (scale_ptr) { - if (slice_shape_r != scale_ptr->shape()) { - scale_cast = reshape(*scale_ptr, slice_shape_r); + if (slice_shape != scale_ptr->shape()) { + scale_cast = reshape(*scale_ptr, slice_shape); } else { scale_cast = *scale_ptr; } if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); + scale_cast = cast(scale_cast, phi::DataType::FLOAT32); } out = out * scale_cast; } Tensor bias_cast; if (bias_ptr) { - if (slice_shape_r != bias_ptr->shape()) { - bias_cast = reshape(*bias_ptr, slice_shape_r); + if (slice_shape != bias_ptr->shape()) { + bias_cast = reshape(*bias_ptr, slice_shape); } else { bias_cast = *bias_ptr; } if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); + bias_cast = cast(bias_cast, phi::DataType::FLOAT32); } out = out + bias_cast; } - mean_ = reshape(mean_, 
slice_shape_l); - variance = reshape(variance, slice_shape_l); + mean_ = reshape(mean_, std::vector({-1})); + variance = reshape(variance, std::vector({-1})); - // same as LayerNormInferMeta - // x: float32 --> out: float32, mean: float32, variance: float32 - // x: float16 --> out: float16, mean: float32, variance: float32 if (need_cast) { out = cast(out, org_dtype); + mean_ = cast(mean_, org_dtype); + variance = cast(variance, org_dtype); } return std::make_tuple(out, mean_, variance); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4f761aa3c8536..5306d282e797c 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -67,7 +67,7 @@ if(WITH_RPC) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi common) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() @@ -79,7 +79,7 @@ if(WITH_IPU) set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) endif() -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() @@ -99,6 +99,7 @@ if(WITH_CUSTOM_DEVICE) if(NOT (WITH_NCCL OR WITH_RCCL + OR WITH_MCCL OR WITH_XPU_BKCL)) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -107,7 +108,7 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) - if(WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() @@ -162,7 +163,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} process_group eager_reducer) - if(WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) @@ -246,7 +247,7 @@ if(WITH_RPC) set(PYBIND_SRCS rpc.cc ${PYBIND_SRCS}) endif() -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() @@ -265,7 +266,7 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if(WITH_NCCL OR WITH_RCCL) + if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) endif() @@ -286,15 +287,20 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - if(WIN32) + if(NOT WITH_ARM) list(REMOVE_ITEM GENERATOR_DEPS python) endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) - if(NOT WIN32) - add_executable(kernel_signature_generator kernel_signature_generator.cc) - target_link_libraries(kernel_signature_generator - ${OP_FUNCTION_GENERETOR_DEPS}) - endif() + # if(NOT WIN32) + # add_executable(kernel_signature_generator kernel_signature_generator.cc) + # if(WITH_MUSA) + # # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' + # target_link_options(kernel_signature_generator PRIVATE + # -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) + # endif() + # target_link_libraries(kernel_signature_generator + # ${OP_FUNCTION_GENERETOR_DEPS}) + # endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(eager_legacy_op_function_generator diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index 
391dbabb1a210..6351d021dfe8c 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -48,7 +48,7 @@ void BindCommContextManager(py::module *m) { .def_static("set_device_id", &phi::distributed::CommContextManager::SetDeviceId, py::call_guard()) -#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL) .def_static( "create_nccl_comm_context", &phi::distributed::CommContextManager::CreateNCCLCommContext, diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2a6c639735a2b..a07aef2fb6996 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -24,7 +24,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream *get_current_stream(int device_id) { if (device_id == -1) { device_id = phi::backends::gpu::GetCurrentDeviceId(); @@ -51,7 +51,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_get_current_stream", [](int deviceId) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::get_current_stream(deviceId); #else PADDLE_THROW( @@ -64,7 +64,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_set_current_stream", [](phi::CUDAStream *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::set_current_stream(stream); #else PADDLE_THROW( @@ -75,7 +75,7 @@ void BindCudaStream(py::module *m_ptr) { py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (device_id == -1) { device_id = paddle::platform::GetCurrentDeviceId(); } @@ -84,6 +84,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif @@ -114,7 +116,7 @@ void BindCudaStream(py::module *m_ptr) { >>> s3 = paddle.device.cuda.Stream() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "wait_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) { @@ -249,7 +251,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -275,7 +277,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, int device, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { 
PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -305,7 +307,7 @@ void BindCudaStream(py::module *m_ptr) { py::arg("device") = -1, py::arg("priority") = 2) .def("__init__", [](phi::CUDAStream &self) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_id = platform::GetCurrentDeviceId(); auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking; new (&self) phi::CUDAStream( @@ -332,7 +334,7 @@ void BindCudaStream(py::module *m_ptr) { >>> event = paddle.device.cuda.Event() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "record", [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) { @@ -399,7 +401,7 @@ void BindCudaStream(py::module *m_ptr) { bool enable_timing, bool blocking, bool interprocess) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) unsigned int flags = platform::GenerateDeviceEventFlag( enable_timing, blocking, interprocess); new (&self) paddle::platform::CudaEvent(flags); diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h index d10608a6e8ea9..61f27960e25e9 100644 --- a/paddle/fluid/pybind/cuda_streams_py.h +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -17,7 +17,7 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #else namespace phi { @@ -29,7 +29,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream* get_current_stream(int device_id = -1); phi::CUDAStream* set_current_stream(phi::CUDAStream* stream); #endif diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 4577171fd77bb..ea61387ae53e5 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -32,7 +32,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/all.h" #include "paddle/phi/core/distributed/types.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif @@ -1224,7 +1224,7 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()); -#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL) py::class_>( *m, "ProcessGroupNCCL", ProcessGroup) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 894ede8db18d2..098c2fa4bdf77 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -322,7 +322,7 @@ void InitTensorWithNumpyValue(TensorObject* self, #endif SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index df84ca68b9182..956de0e9d371a 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -58,7 +58,7 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/pybind/cuda_streams_py.h" #endif diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 2c01e122914aa..e932ecb34201c 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -139,7 +139,7 @@ std::set _complex_dtypes{ void SetDevice(paddle::platform::Place place) { if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 584d1b8b58482..48a8fdc8daa70 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -54,7 +54,6 @@ typedef SSIZE_T ssize_t; #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/common/ddim.h" #include "paddle/fluid/eager/amp_utils.h" -#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/fluid/framework/python_headers.h" @@ -319,11 +318,13 @@ static PyObject* tensor_method_numpy(TensorObject* self, dense_tensor->Holder()->size()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (self->tensor.is_gpu()) { eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = 
cudaMemcpyDeviceToHost; +#elif defined(PADDLE_WITH_MUSA) + gpuMemcpyKind kind = musaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; phi::DeviceContextPool::Instance().Get(self->tensor.place())->Wait(); @@ -1360,7 +1361,6 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Dealing with basic indexing - bool out_is_view = false; auto out = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1369,8 +1369,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice, - &out_is_view); + &use_strided_slice); if (!has_advanced_index) { return ToPyObject(out); @@ -1378,7 +1377,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, // step3: Dealing with advanced indexing std::vector transed_index; - std::vector trans_back_dim, trans_dim; + std::vector trans_back_dim; int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_tensor = dealWithAdvancedIndex(out, @@ -1388,9 +1387,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim, - &trans_dim, - &out_is_view); + &rank_of_new_dim); if (transed_index.size() == 1 && transed_index[0].dtype() == phi::DataType::BOOL) { @@ -1420,14 +1417,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = rank_of_new_dim, tmp2 = 0, + int tmp1 = pos_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < pos_of_new_dim) { + if (i < rank_of_new_dim) { perm[i] = - tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, rank_of_new_dim) + tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) + } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, pos_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } @@ -1612,9 +1609,12 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Parse values - std::vector values; + PADDLE_ENFORCE( + PyCheckTensor(value_obj), + platform::errors::InvalidArgument("The value must be a Tensor")); + paddle::Tensor value_tensor = - dealWithValues(tensor, value_obj, &values, has_advanced_index); + reinterpret_cast(value_obj)->tensor; if (!has_advanced_index) { // use set_value OP if there is no advanced index @@ -1622,60 +1622,45 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator - if (value_tensor.initialized()) { + if (value_tensor.initialized() && + (self->tensor.dtype() != value_tensor.dtype())) { + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } if (self->tensor.dtype() != value_tensor.dtype()) { - if 
(egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); - } - if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); - } + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); } + } - // step3.1: Only basic indexing, use OP set_value. - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); - } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - value_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); - } - } - } else { - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor); + // step3.1: Only basic indexing, use OP set_value. + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); + } + self->tensor = set_value_with_tensor__ad_func(self->tensor, + value_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); } - self->tensor = set_value__ad_func(self->tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes, - {1}, - values); } } else { // step3.2: Case for there are advanced indexing. @@ -1685,7 +1670,6 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // 3. assign values to the sliced result by index_put OP; // 4. transpose back and assign the result to original tensor by set_value // OP. 
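// The four steps above reduce to: bring the advanced-indexed axis to the front,
// write the values there, then restore the original layout. A minimal stand-alone
// analogy of steps 2-4 in plain C++ (this is not the Paddle API; the 2x4 array,
// the index list {1, 3} and the fill value 9 are illustrative only):
#include <cstdio>
#include <vector>

int main() {
  std::vector<std::vector<int>> x = {{0, 1, 2, 3}, {4, 5, 6, 7}};  // 2 x 4 "tensor"
  std::vector<int> advanced_index = {1, 3};  // emulates x[:, [1, 3]] = 9

  // step 2: "transpose" so the advanced axis (columns) becomes the leading one
  std::vector<std::vector<int>> t(4, std::vector<int>(2));
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < x[i].size(); ++j) t[j][i] = x[i][j];

  // step 3: index_put along the leading axis
  for (int idx : advanced_index)
    for (size_t i = 0; i < t[idx].size(); ++i) t[idx][i] = 9;

  // step 4: transpose back and write the result into the original tensor
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < x[i].size(); ++j) x[i][j] = t[j][i];

  for (const auto& row : x) {  // prints "0 9 2 9" and "4 9 6 9"
    for (int v : row) std::printf("%d ", v);
    std::printf("\n");
  }
  return 0;
}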
- bool out_is_view = false; paddle::Tensor sub_tensor = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1694,13 +1678,12 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice, - &out_is_view); + &use_strided_slice); std::vector transed_index; - std::vector trans_back_dim, trans_dim; + std::vector trans_back_dim; - int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; + int pos_of_new_dim = 0, rank_of_new_dim = 0; paddle::Tensor transed_sub_tensor = dealWithAdvancedIndex(sub_tensor, @@ -1710,127 +1693,61 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim, - &trans_dim, - &out_is_view); + &rank_of_new_dim); // Release gil and do tracing py::gil_scoped_release release; - if (value_tensor.initialized()) { - if (self->tensor.dtype() != value_tensor.dtype()) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } - if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); - } - } - if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { - value_tensor = transpose_ad_func(value_tensor, trans_dim); + if (value_tensor.initialized() && + (self->tensor.dtype() != value_tensor.dtype())) { + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); } - - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); } + } - if (transed_index.size() == 1 && - transed_index[0].dtype() == phi::DataType::BOOL && - transed_index[0].shape().size() == self->tensor.shape().size()) { - if (value_tensor.shape() != self->tensor.shape()) { - value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); - } - transed_sub_tensor = - where__ad_func(logical_not_ad_func(transed_index[0]), - transed_sub_tensor, - value_tensor); - } else { - transed_sub_tensor = - index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); - } + // TODO(zoooo0820) 1.Using inplace version index_put + // 2.Remove following code after backward bug fixed. + transed_sub_tensor = assign_ad_func(transed_sub_tensor); - if (out_is_view) { - // NOTE(zoooo0820): if out_is_view is true, it is a case of - // combined-indexing setitem, i.e. firstly we get a view of - // self->tensor, then modified it with inplace api index_put_ For now, - // in design of Paddle, the forward result is right. 
But the backward - // edge can not be established because the Base Tensor cannot sense - // whether it has been modified by other operations. Following codes are - // to add a new node (set_value_with_tensor_grad) to record the backward - // edge, with out ad_function which needs to do the forward calculation. - - egr::AutogradMeta* x_autograd_meta = - egr::EagerUtils::nullable_autograd_meta(self->tensor); - egr::AutogradMeta* values_autograd_meta = - egr::EagerUtils::nullable_autograd_meta(transed_sub_tensor); - bool trace_backward = egr::Controller::Instance().HasGrad(); - bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( - trace_backward, x_autograd_meta, values_autograd_meta); - // Node Declaration - std::shared_ptr grad_node; - // Set grad_node before API Call - if (require_any_grad) { - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); - const auto& values_tmp = - (require_any_grad && transback_sub_tensor.is_dense_tensor() && - !std::dynamic_pointer_cast( - transback_sub_tensor.impl()) - ->meta() - .is_contiguous()) - ? paddle::Tensor( - std::make_shared( - std::move(paddle::experimental::Trans2Contiguous( - *(std::dynamic_pointer_cast( - transback_sub_tensor.impl()))))), - transback_sub_tensor.mutable_autograd_meta()) - : transback_sub_tensor; - - grad_node = std::shared_ptr( - new SetValueWithTensorGradNode(1, 2)); // NOLINT - grad_node->SetAttributestarts(slice_starts); - grad_node->SetAttributeends(slice_ends); - grad_node->SetAttributesteps(slice_strides); - grad_node->SetAttributeaxes(slice_axes); - grad_node->SetAttributedecrease_axes(decrease_axis); - grad_node->SetAttributenone_axes(none_axes); - grad_node->SetTensorWrappervalues(values_tmp); - - paddle::memory::LogDeviceMemoryStats( - egr::Controller::Instance().GetExpectedPlace(), - "set_value_with_tensor"); - egr::EagerUtils::CheckInplace( - self->tensor, x_autograd_meta, require_any_grad); - egr::EagerUtils::PassStopGradient(false, x_autograd_meta); - // SetGradOutMeta & SetEdges - grad_node->SetGradOutMeta(self->tensor, 0); - grad_node->SetGradOutMeta(transback_sub_tensor, 1); - if (x_autograd_meta) { - egr::EagerUtils::SetOutRankWithSlot(x_autograd_meta, 0); - egr::EagerUtils::SetHistory(x_autograd_meta, grad_node); - } - grad_node->SetGradInMeta(self->tensor, 0); - } - } - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); - } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } + + transed_sub_tensor = + index_put_ad_func(transed_sub_tensor, transed_index, value_tensor); + + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + + self->tensor = set_value_with_tensor__ad_func(self->tensor, + transback_sub_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. 
+ if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); } } } diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 520fe09bc710c..05374b08d8fc2 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,7 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 8ba56008fb2b0..7199eb13c579b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -869,7 +869,7 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_()); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) py::class_>( @@ -951,7 +951,7 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_) @@ -373,10 +373,10 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -384,7 +384,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -546,7 +546,7 @@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 3ba9ec3239c37..7b9002feed8ed 100644 --- 
a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -268,7 +268,7 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -325,7 +325,7 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index feafd1fa4333e..7949d7c1c3394 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -134,7 +134,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -146,11 +146,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -238,7 +238,7 @@ bool IsCompiledWithAVX() { } bool IsCompiledWithCUDA() { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) return false; #else return true; @@ -279,7 +279,15 @@ bool IsCompiledWithMPIAWARE() { } bool IsCompiledWithROCM() { -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) return false; +#else + return true; +#endif +} + +bool IsCompiledWithMUSA() { +#if !defined(PADDLE_WITH_MUSA) return false; #else return true; @@ -675,16 +683,16 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { string::join_strings(ops, ','))); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) static int GetNCCLVersion() { -#if NCCL_VERSION_CODE >= 2304 +#if defined(PADDLE_WITH_MCCL) || NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); +#ifdef PADDLE_WITH_MCCL + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetVersion(&ver)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); +#endif return ver; -#else - PADDLE_THROW(platform::errors::External( - "Cannot get NCCL version successfully when nccl version < 2.3.4")); -#endif +#else + PADDLE_THROW(platform::errors::External( + "Cannot get NCCL version successfully when nccl version < 2.3.4")); +#endif } #endif @@ -930,7 +938,7 @@ PYBIND11_MODULE(libpaddle, m) { return
self->OutputMeta(); }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -940,7 +948,7 @@ PYBIND11_MODULE(libpaddle, m) { }); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) m.def("nccl_version", &GetNCCLVersion); #endif @@ -982,7 +990,7 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -1256,7 +1264,7 @@ All parameter, weight, gradient are variables in Paddle. "get_fetch_list", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) .def( "get_communicator", [](Variable &self) -> platform::Communicator * { @@ -1724,7 +1732,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1758,7 +1766,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPinnedPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1766,7 +1774,7 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::CUDAPinnedDeviceContext(place); #endif }); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) py::class_(m, "Communicator").def(py::init<>()); #endif m.def("get_all_device_type", []() { @@ -2106,6 +2114,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_avx", IsCompiledWithAVX); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_rocm", IsCompiledWithROCM); + m.def("is_compiled_with_musa", IsCompiledWithMUSA); m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice); m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); @@ -2384,7 +2393,7 @@ All parameter, weight, gradient are variables in Paddle. 
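// For reference, the new is_compiled_with_musa binding above follows the same
// convention as the existing is_compiled_with_cuda / is_compiled_with_rocm
// helpers: the answer is fixed at compile time by a PADDLE_WITH_* macro. A
// stand-alone sketch of that convention (the printf wrapper and main() are
// illustrative only, not part of the binding code):
#include <cstdio>

static bool IsCompiledWithMUSA() {
#if !defined(PADDLE_WITH_MUSA)
  return false;
#else
  return true;
#endif
}

int main() {
  // Prints "true" only when this translation unit is built with -DPADDLE_WITH_MUSA.
  std::printf("is_compiled_with_musa: %s\n", IsCompiledWithMUSA() ? "true" : "false");
  return 0;
}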
py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2430,7 +2439,7 @@ All parameter, weight, gradient are variables in Paddle. return ostr.str(); }); -#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2512,7 +2521,7 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2669,7 +2678,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); @@ -2957,7 +2966,7 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_BOX_PS BindBoxWrapper(&m); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) BindNCCLWrapper(&m); #endif #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 919a3a4650d3e..918d2eeae4272 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -26,11 +26,9 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -347,13 +345,11 @@ static paddle::Tensor getTensorWithBasicIndexing( std::vector* decrease_axis, std::vector* none_axes, std::vector* infer_flags, - bool* use_strided_slice, - bool* out_is_view) { + bool* use_strided_slice) { paddle::Tensor out; if (slice_axes->empty()) { out = tensor; } else { - *out_is_view = true; if (!(*use_strided_slice)) { eager_gil_scoped_release guard; out = slice_ad_func(tensor, @@ -374,7 +370,6 @@ static paddle::Tensor getTensorWithBasicIndexing( } } if (!none_axes->empty()) { - *out_is_view = true; eager_gil_scoped_release guard; // Deal with cases that decrease_axes is not empty // For example: @@ -402,9 +397,9 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector* transed_index, std::vector* trans_back_dim, int* pos_of_new_dim, - int* rank_of_new_dim, - std::vector* trans_dim, - bool* out_is_view) { + int* rank_of_new_dim) { + std::vector trans_dim; + int p = 0; for (size_t i = 0; i < advanced_index_dim->size(); ++i) { auto index_dim = (*advanced_index_dim)[i]; @@ -413,28 +408,30 @@ static paddle::Tensor dealWithAdvancedIndex( // advanced_index_dim auto index = (*advanced_index)[p++]; - if (index_dim == 0) { - // case 1: advanced indices at axis 0, the new dim will be at first. - *pos_of_new_dim = 0; - } else if (index_dim > 0 && trans_dim->size() > 0 && - (*trans_dim)[trans_dim->size() - 1] != index_dim - 1) { - // case 2: there are not adjacent advanced indices, the new dim will - // be at first. - *pos_of_new_dim = 0; - } else { - *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); + if (!is_for_setitem) { + if (index_dim == 0) { + // case 1: advanced indices at axis 0, the new dim will be at first. + *pos_of_new_dim = 0; + } else if (index_dim > 0 && trans_dim.size() > 0 && + trans_dim[trans_dim.size() - 1] != index_dim - 1) { + // case 2: there are not adjacent advanced indices, the new dim will + // be at first. 
+ *pos_of_new_dim = 0; + } else { + *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); + } + *rank_of_new_dim = + std::max(*rank_of_new_dim, static_cast(index.shape().size())); } - *rank_of_new_dim = - std::max(*rank_of_new_dim, static_cast(index.shape().size())); - trans_dim->push_back(index_dim); + trans_dim.push_back(index_dim); transed_index->push_back(std::move(index)); } } for (size_t i = 0; i < tensor.shape().size(); ++i) { if ((*advanced_index_dim)[i] == -1) { - trans_dim->push_back(i); + trans_dim.push_back(i); } } @@ -444,20 +441,19 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector original_dim_order(tensor.shape().size()); std::iota(original_dim_order.begin(), original_dim_order.end(), 0); - if (original_dim_order == *trans_dim) { + if (original_dim_order == trans_dim) { transed_tensor = tensor; } else { - *out_is_view = true; - transed_tensor = transpose_ad_func(tensor, *trans_dim); + transed_tensor = transpose_ad_func(tensor, trans_dim); } if (is_for_setitem) { - trans_back_dim->resize(trans_dim->size()); + trans_back_dim->resize(trans_dim.size()); std::iota(trans_back_dim->begin(), trans_back_dim->end(), 0); std::sort(trans_back_dim->begin(), trans_back_dim->end(), [&trans_dim](int left, int right) { - return (*trans_dim)[left] < (*trans_dim)[right]; + return trans_dim[left] < trans_dim[right]; }); } return transed_tensor; @@ -515,104 +511,5 @@ static void ParseBoolAndBroadcastIndices( } } -static paddle::Tensor dealWithValues(const paddle::Tensor& tensor, - PyObject* value_obj, - std::vector* values, - const bool trans_to_tensor) { - paddle::Tensor value_tensor; - if (PyCheckTensor(value_obj)) { - value_tensor = reinterpret_cast(value_obj)->tensor; - } else if (py::isinstance(value_obj)) { - paddle::Tensor value_tensor_tmp( - std::make_shared(), - egr::Controller::Instance().GenerateUniqueName()); - py::object value_obj_tmp(py::handle(value_obj), true); - py::object value = value_obj_tmp; - if (tensor.dtype() == phi::DataType::FLOAT32) { - if (!py::isinstance>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray(value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::FLOAT64) { - if (!py::isinstance>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray(value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::INT32) { - if (!py::isinstance>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray(value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::INT64) { - if (!py::isinstance>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray(value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::BOOL) { - if (!py::isinstance>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray(value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::COMPLEX64) { - if (!py::isinstance>>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray>( - value_obj_tmp); - } - } else if (tensor.dtype() == phi::DataType::COMPLEX128) { - if (!py::isinstance>>(value_obj_tmp)) { - value = pybind11::detail::CastNumpyArray>( - value_obj_tmp); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "When assign a numpy.np value to a paddle.Tensor, " - "the data type of the paddle.Tensor must be bool, " - "float32, float64, complex64, complex128, int32 or int64, " - "please check the type of tensor.")); - } - SetTensorFromPyArray( - static_cast(value_tensor_tmp.impl().get()), - value, - tensor.place(), - false); - value_tensor = value_tensor_tmp; - } else { - py::object 
value_obj_tmp(py::handle(value_obj), true); - // convert the value to self data type - if (py::isinstance(value_obj_tmp) || - py::isinstance(value_obj_tmp) || - py::isinstance(value_obj_tmp) || - PyComplex_Check(value_obj)) { - if (tensor.dtype() == phi::DataType::FLOAT32 || - tensor.dtype() == phi::DataType::FLOAT16 || - tensor.dtype() == phi::DataType::BFLOAT16) { - values->push_back(value_obj_tmp.cast()); - } else if (tensor.dtype() == phi::DataType::FLOAT64) { - values->push_back(value_obj_tmp.cast()); - } else if (tensor.dtype() == phi::DataType::INT32 || - tensor.dtype() == phi::DataType::INT16 || - tensor.dtype() == phi::DataType::INT8 || - tensor.dtype() == phi::DataType::UINT8) { - values->push_back(value_obj_tmp.cast()); - } else if (tensor.dtype() == phi::DataType::INT64) { - values->push_back(value_obj_tmp.cast()); - } else if (tensor.dtype() == phi::DataType::BOOL) { - values->push_back(value_obj_tmp.cast()); - } else if (tensor.dtype() == phi::DataType::COMPLEX64) { - values->push_back(value_obj_tmp.cast>()); - } else if (tensor.dtype() == phi::DataType::COMPLEX128) { - values->push_back(value_obj_tmp.cast>()); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Value type error. The assign value allows " - "Tensor, numpy.ndarray, integer, float, complex or bool, " - "but received %s.", - Py_TYPE(value_obj))); - } - - if (trans_to_tensor) { - value_tensor = - full_ad_func({1}, (*values)[0], tensor.dtype(), tensor.place()); - } - } - return value_tensor; -} - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 44983e3e13df7..cce09cf7fdfd5 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -117,7 +117,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -126,11 +126,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -1101,7 +1101,7 @@ void BindTensor(pybind11::module &m) { // NOLINT .def("height", &phi::SelectedRows::height) .def("set_rows", [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) self.set_rows(rows); #else std::vector new_rows(rows); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index dd5bd7f1d91c4..622d054645eff 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +325,7 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +362,7 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +457,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -466,6 +466,9 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + paddle::platform::GpuMemcpySync( + dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); @@ -790,7 +793,7 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1047,11 +1050,13 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; +#elif defined(PADDLE_WITH_MUSA) + gpuMemcpyKind kind = musaMemcpyDeviceToHost; #endif phi::DenseTensor cpu_tensor; platform::CPUPlace cpu_place; diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 09b4337ecb40b..8636de26c4161 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -51,6 +51,13 @@ if(WITH_GPU) list(APPEND PHI_DEPS external_error_proto) endif() +if(WITH_MUSA) + set(DEPENDENT_LIBRARIES "") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmudnn.so") + list(APPEND PHI_DEPS 
${DEPENDENT_LIBRARIES}) +endif() + + if(WITH_ASCEND_CL) list(APPEND PHI_DEPS npu_hccl) endif() @@ -134,11 +141,11 @@ if(WITH_GPU) SRCS ${PHI_SRCS} DEPS ${PHI_DEPS}) elseif(WITH_ROCM) - hip_library( - phi ${PHI_BUILD_TYPE} - SRCS ${PHI_SRCS} - DEPS ${PHI_DEPS}) - + hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) +elseif(WITH_MUSA) + musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index 86ba7b9cf7576..a6f8b3949c20a 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -99,7 +99,7 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * Get the current CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index a6e78686e1e4c..3ef838410bed0 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -24,6 +24,11 @@ limitations under the License. */ using gpuStream_t = cudaStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -413,7 +418,7 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index ed64ff1c937b6..2d5d1a49f02e7 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -385,7 +385,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from; return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -437,7 +437,7 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, })); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -489,7 +489,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from[i]; continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to[i]->dtype(), "StridedCopyKernel", ([&] { diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index ee1e21a58e2f1..b2c3f9f28ee79 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -75,7 +75,7 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); // NOLINT } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 03ac68d331991..2ea7ae4f5e3d8 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -116,7 +116,7 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -158,7 +158,7 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -196,7 +196,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " << dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(yy): TransDataPlace should wait for computation of input. 
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); @@ -247,7 +247,7 @@ phi::DenseTensor Trans2Contiguous(const phi::DenseTensor& tensor) { if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 49c47cbcce363..ee88e9fb1b0c8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -376,7 +376,7 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index 9c11e88260c1d..0aad2a6da5fdc 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #else -#include +#include #endif #endif @@ -33,26 +33,26 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10000 - cudaPointerAttributes attr; - cudaError_t status = cudaPointerGetAttributes(&attr, data); - if (status == cudaSuccess && attr.type == cudaMemoryTypeDevice) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +// #ifdef PADDLE_WITH_CUDA +// #if CUDA_VERSION >= 10000 + musaPointerAttributes attr; + musaError_t status = musaPointerGetAttributes(&attr, data); + if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { return phi::GPUPlace(attr.device); } -#else - PADDLE_THROW( - phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " - "supported when CUDA version >= 10.0.")); -#endif -#else - hipPointerAttribute_t attr; - hipError_t status = hipPointerGetAttributes(&attr, data); - if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { - return phi::GPUPlace(attr.device); - } -#endif +// #else +// PADDLE_THROW( +// phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " +// "supported when CUDA version >= 10.0.")); +// #endif +// #else +// hipPointerAttribute_t attr; +// hipError_t status = hipPointerGetAttributes(&attr, data); +// if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { +// return phi::GPUPlace(attr.device); +// } +// #endif #endif return phi::CPUPlace(); } diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index eb765ebdcb9dd..70fb4d948986c 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,8 +27,10 @@ 
limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_MUSA +#include +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -62,7 +64,7 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -81,7 +83,7 @@ class Event { uint64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -137,12 +139,14 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) public: CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -152,6 +156,8 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -161,6 +167,8 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -169,6 +177,8 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -183,6 +193,14 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event_); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -199,6 +217,8 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -208,6 +228,8 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; +#elif defined(PADDLE_WITH_MUSA) + unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 81339a24c50de..3a87826337465 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -944,6 +944,8 @@ func : gather_nd_grad composite : gather_nd_grad(x, index, out_grad, x_grad) no_need_buffer : x + data_transform : + skip_transform : index - backward_op : 
gaussian_inplace_grad forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out) @@ -1760,8 +1762,8 @@ optional : boxes_num - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) output : Tensor(arr_grad), Tensor(values_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index c7ec9ace290ac..3769155eb27e1 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -425,7 +425,6 @@ def source_include(header_file_path, fw_header_file_path): #include "{fw_header_file_path}" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 3f11781dfe88e..04cf57a88bb7c 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -614,14 +614,14 @@ - backward_op : set_value_grad forward : set_value (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) -> Tensor(out) - args : (Tensor out_grad, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) + args : (Tensor out_grad) output : Tensor(x_grad) infer_meta: func: UnchangedInferMeta param: [out_grad] kernel: - func: set_value_with_scalar_grad - param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] + func: assign + param: [out_grad] - backward_op : set_value_with_tensor_grad forward: set_value_with_tensor (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) -> Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index dfcdf65673e20..e4bbb15073f41 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2432,7 +2432,7 @@ outputs : out : Result attrs : - {axis : Axis, reduce : Reduce, include_self: Include_self} + {axis : Axis, reduce : Reduce} - op : pylayer backward : pylayer_grad diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index efc1b17714a85..092b3d71a60b4 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2032,7 +2032,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") output : Tensor(out) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index ed47487553bee..db0d463bc6715 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ 
b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,6 +16,9 @@ if(WITH_GPU OR WITH_ROCM) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() + if(WITH_MUSA) + list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) + endif() endif() if(WITH_XPU) @@ -49,6 +52,7 @@ list( if(WITH_GPU OR WITH_ROCM + OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index 7824fc3b160b1..9e8ecd48e453c 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index 52f0ced275ac5..a0537c779e52f 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -28,7 +28,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -47,7 +47,7 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index ddbfc60f19f08..48bedd1bd939e 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -383,7 +383,7 @@ class CustomDevice : public DeviceInterface { void* ptr = nullptr; const auto device = &devices_pool[dev_id]; - if (!pimpl_->host_memory_allocate) { + if (!pimpl_->unified_memory_allocate) { PADDLE_THROW(phi::errors::Unavailable( "MemoryAllocateHost is not supported on %s.", Type())); } else { diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index d160b5034f998..ac16a69aa7bee 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,7 +78,8 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -88,12 +89,14 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, @@ -101,6 +104,13 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); +#elif defined(PADDLE_WITH_MUSA) +static bool CheckCUDADriverResult(MUresult result, + std::string caller, + std::string kernel_name = "") { + if (result != MUSA_SUCCESS) { + const char* error = nullptr; + dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -130,6 +140,8 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#elif defined(PADDLE_WITH_MUSA) + mtrtcResult nvrtc_result = dynload::mtrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -140,6 +152,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -153,6 +168,8 @@ void GPUDeviceCode::CheckAvailableStatus() { << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + if (nvrtc_result != MTRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -163,6 +180,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { +#elif defined(PADDLE_WITH_MUSA) + if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -202,6 +222,8 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; +#elif defined(PADDLE_WITH_MUSA) + cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -229,6 +251,8 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; +#elif defined(PADDLE_WITH_MUSA) + kernel_ = kernel; #else kernel_ = kernel; #endif @@ -257,12 +281,12 @@ bool GPUDeviceCode::Compile(bool include_path) { auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); - std::vector options = {"-std=c++11"}; + std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; std::string include_option; if (include_path) { std::string cuda_include_path = FindCUDAIncludePath(); if (!cuda_include_path.empty()) { - include_option = "-I" + cuda_include_path; + include_option = "--include-path=" + cuda_include_path; options.push_back(include_option.c_str()); } } @@ -318,6 +342,86 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } +#elif defined(PADDLE_WITH_MUSA) + mtrtcProgram 
program; + if (!CheckNVRTCResult(dynload::mtrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "mtrtcCreateProgram")) { + return false; + } + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + std::vector options = {"--std=c++11", compute_flag.c_str()}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + mtrtcResult compile_result = + dynload::mtrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == MTRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::mtrtcGetProgramLogSize(program, &log_size), + "mtrtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if (!CheckNVRTCResult(dynload::mtrtcGetProgramLog(program, log.data()), + "nvrtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of MUSA code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program + size_t ptx_size; + if (!CheckNVRTCResult(dynload::mtrtcGetMUSASize(program, &ptx_size), + "mtrtcGetMUSASize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::mtrtcGetMUSA(program, ptx_.data()), + "mtrtcGetMUSA")) { + return false; + } + + if (!CheckNVRTCResult(dynload::mtrtcDestroyProgram(&program), + "mtrtcDestroyProgram")) { + return false; + } + + if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), + "muModuleLoadData", + name_)) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::muModuleGetFunction(&function_, module_, name_.c_str()), + "muModuleGetFunction", + name_)) { + return false; + } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -436,6 +540,22 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_EQ( + dynload::muLaunchKernel(function_, + num_blocks, + 1, + 1, // grid dim + num_threads_, + 1, + 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + MUSA_SUCCESS, + errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -464,6 +584,18 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } + return true; +} +#elif defined(PADDLE_WITH_MUSA) +bool GPUDeviceCode::CheckNVRTCResult(mtrtcResult result, std::string function) { + if (result != MTRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::mtrtcGetErrorString(result); + return false; + } + return true; +} #else bool 
GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -472,9 +604,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } -#endif return true; } #endif +#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 8debb4dc9c45e..964124076e605 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,11 +26,20 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musa_driver.h" +#include "paddle/phi/backends/dynload/musartc.h" +#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" #endif +#ifdef PADDLE_WITH_MUSA +// #include "paddle/phi/backends/dynload/hiprtc.h" +// #include "paddle/phi/backends/dynload/rocm_driver.h" +#endif + namespace phi { class DeviceCode { @@ -48,7 +57,7 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, @@ -68,6 +77,8 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); +#elif defined(PADDLE_WITH_MUSA) + bool CheckNVRTCResult(mtrtcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -82,6 +93,9 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; +#elif defined(PADDLE_WITH_MUSA) + MUmodule module_; + MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index c65e06364acd0..d731b6b6d1ecf 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,7 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 2db75d7022f0a..2ea6f11aa53a6 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,6 +30,17 @@ if(WITH_ROCM) rocsparse.cc) endif() +if(WITH_MUSA) + list( + APPEND + MUSA_SRCS + mublas.cc + mudnn.cc + murand.cc + mufft.cc + musparse.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. 
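Illustration (editor's sketch, not part of the patch): the mublas, murand, mufft and musparse sources listed above, like the mccl, musa_driver and musartc wrappers added further below, all bind their libraries lazily -- the DSO handle is opened at most once under std::call_once via GetDsoHandleFromSearchPath, and each symbol is looked up with dlsym and cached (mudnn is the exception; it is linked directly). The self-contained example below applies the same wrapper-macro pattern to libc's strlen() so it can be built and run without any MUSA library; libc.so.6, DECLARE_DYNAMIC_LOAD_WRAP and the dyn_ prefix are illustrative choices, not Paddle symbols.

// Editor's sketch of the lazy dlopen/dlsym binding used by the dynload wrappers.
// The library handle is opened at most once (std::call_once) and every symbol is
// resolved a single time, then cached in a function-local static.
// Build on Linux with: g++ -std=c++11 demo.cc -ldl
#include <string.h>  // declares ::strlen, which the macro wraps below
#include <cstdio>
#include <dlfcn.h>
#include <mutex>

static std::once_flag libc_flag;
static void* libc_handle = nullptr;

#define DECLARE_DYNAMIC_LOAD_WRAP(__name)                         \
  struct DynLoad__##__name {                                      \
    template <typename... Args>                                   \
    auto operator()(Args... args) -> decltype(__name(args...)) {  \
      using func_t = decltype(&::__name);                         \
      std::call_once(libc_flag, []() {                            \
        libc_handle = dlopen("libc.so.6", RTLD_LAZY);             \
      });                                                         \
      static void* p_##__name = dlsym(libc_handle, #__name);      \
      return reinterpret_cast<func_t>(p_##__name)(args...);       \
    }                                                             \
  };                                                              \
  static DynLoad__##__name dyn_##__name

DECLARE_DYNAMIC_LOAD_WRAP(strlen);  // dyn_strlen() forwards to the dlsym'ed strlen()

int main() {
  std::printf("strlen(\"paddle\") = %zu\n", dyn_strlen("paddle"));
  return 0;
}

The remaining dynload build rules and loader changes continue below.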
 if(NOT APPLE)
@@ -46,6 +57,15 @@ if(NOT APPLE)
     list(APPEND HIP_SRCS cupti.cc)
   endif()
 endif()
+  if(WITH_MUSA)
+    list(APPEND MUSA_SRCS musartc.cc musa_driver.cc)
+    if(WITH_MCCL)
+      list(APPEND MUSA_SRCS mccl.cc)
+    endif()
+    if(CUPTI_FOUND)
+      list(APPEND MUSA_SRCS cupti.cc)
+    endif()
+  endif()
 endif()
 if(TENSORRT_FOUND)
@@ -93,6 +113,8 @@ if(WITH_ROCM)
   collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS})
 elseif(WITH_GPU)
   collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS})
+elseif(WITH_MUSA)
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS})
 else()
   collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS})
 endif()
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index bdb9e120d2884..987f0eefc4397 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -102,6 +102,29 @@ PHI_DEFINE_string(rccl_dir,
                   "",
                   "dlopen will search rccl from LD_LIBRARY_PATH");
 #endif
+#ifdef PADDLE_WITH_MUSA
+
+PHI_DEFINE_string(mudnn_dir,
+                  "",
+                  "Specify path for loading libmudnn.so. For instance, "
+                  "/usr/local/musa/lib. If empty [default], dlopen "
+                  "will search mudnn from LD_LIBRARY_PATH");
+
+PHI_DEFINE_string(musa_dir,
+                  "",
+                  "Specify path for loading MUSA libraries, such as libmublas.so. "
+                  "For instance, /usr/local/musa/lib. "
+                  "If default, dlopen will search musa from LD_LIBRARY_PATH");
+
+PHI_DEFINE_string(mccl_dir,
+                  "",
+                  "Specify path for loading mccl library, such as libmccl.so. "
+                  "For instance, /usr/local/musa/lib. If default, "
+                  "dlopen will search mccl from LD_LIBRARY_PATH");
+#endif
+
+
+
 #ifdef PADDLE_WITH_XPU
 PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so.");
 #endif
@@ -326,6 +349,8 @@ void* GetCublasDsoHandle() {
       FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
+#elif defined(PADDLE_WITH_MUSA)
+  return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmublas.so");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
@@ -367,6 +392,9 @@ void* GetCUDNNDsoHandle() {
       FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
+#elif defined(PADDLE_WITH_MUSA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, "libmudnn.so", false, {cuda_lib_path});
 #else
   return GetDsoHandleFromSearchPath(
       FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path});
@@ -391,6 +419,8 @@ void* GetCurandDsoHandle() {
       FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
+#elif defined(PADDLE_WITH_MUSA)
+  return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmurand.so");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
@@ -406,6 +436,12 @@ void* GetROCFFTDsoHandle() {
 }
 #endif
+#ifdef PADDLE_WITH_MUSA
+void* GetMUFFTDsoHandle() {
+  return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmufft.so");
+}
+#endif
+
 void* GetNvjpegDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
@@ -436,6 +472,8 @@ void* GetCusparseDsoHandle() {
       FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so");
+#elif 
defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusparse.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); #endif @@ -446,6 +484,8 @@ void* GetNVRTCDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusart.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -456,6 +496,8 @@ void* GetCUDADsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusa.so", false); #elif defined(_WIN32) char system32_dir[MAX_PATH]; GetSystemDirectory(system32_dir, MAX_PATH); @@ -513,6 +555,9 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); +#elif defined(PADDLE_WITH_MUSA) + std::string warning_msg( + "You may need to install 'mccl' from musa official website."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " @@ -526,6 +571,9 @@ void* GetNCCLDsoHandle() { #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); +#elif defined(PADDLE_WITH_MUSA) && defined(PADDLE_WITH_MCCL) + return GetDsoHandleFromSearchPath( + FLAGS_mccl_dir, "libmccl.so", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index 6ddeb1386410f..02da303b2020f 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -48,6 +48,7 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); +void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/paddle/phi/backends/dynload/mccl.cc b/paddle/phi/backends/dynload/mccl.cc new file mode 100644 index 0000000000000..3bf5fd8c985d1 --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace phi { +namespace dynload { + +std::once_flag mccl_dso_flag; +void *mccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) + +MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) + +MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) + +MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mccl.h b/paddle/phi/backends/dynload/mccl.h new file mode 100644 index 0000000000000..4e2eaeea00afa --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mccl_dso_flag; +extern void* mccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(&::__name); \ + std::call_once(mccl_dso_flag, []() { \ + mccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(mccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(mcclCommInitAll); \ + __macro(mcclGetUniqueId); \ + __macro(mcclCommInitRank); \ + __macro(mcclCommAbort); \ + __macro(mcclCommDestroy); \ + __macro(mcclCommCount); \ + __macro(mcclCommCuDevice); \ + __macro(mcclCommUserRank); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclGroupStart); \ + __macro(mcclAllGather); \ + __macro(mcclGroupEnd); \ + __macro(mcclReduce); \ + __macro(mcclReduceScatter); \ + __macro(mcclCommGetAsyncError); \ + __macro(mcclGetErrorString); + +MCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); +MCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +MCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(mcclSend); \ + __macro(mcclRecv); +MCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); +MCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.cc b/paddle/phi/backends/dynload/mublas.cc new file mode 100644 index 0000000000000..fd05d45414b47 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.cc @@ -0,0 
+1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace phi { +namespace dynload { +std::once_flag mublas_dso_flag; +void *mublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 +MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 +MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 +MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h new file mode 100644 index 0000000000000..9f8db31bd2d06 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mublas_dso_flag; +extern void *mublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using blas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(mublas_dso_flag, []() { \ + mublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(mublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(mublasSaxpy); \ + __macro(mublasDaxpy); \ + __macro(mublasCaxpy); \ + __macro(mublasZaxpy); \ + __macro(mublasSscal); \ + __macro(mublasDscal); \ + __macro(mublasScopy); \ + __macro(mublasDcopy); \ + __macro(mublasSgemv); \ + __macro(mublasDgemv); \ + __macro(mublasCgemv); \ + __macro(mublasZgemv); \ + __macro(mublasSgemm); \ + __macro(mublasDgemm); \ + __macro(mublasCgemm); \ + __macro(mublasZgemm); \ + __macro(mublasSgeam); \ + __macro(mublasDgeam); \ + __macro(mublasStrsm); \ + __macro(mublasDtrsm); \ + __macro(mublasCtrsm); \ + __macro(mublasZtrsm); \ + __macro(mublasCreate); \ + __macro(mublasDestroy); \ + __macro(mublasSetStream); \ + __macro(mublasSetPointerMode); \ + __macro(mublasGetPointerMode); \ + __macro(mublasSgemmBatched); \ + __macro(mublasDgemmBatched); \ + __macro(mublasCgemmBatched); \ + __macro(mublasZgemmBatched); \ + __macro(mublasStrsmBatched); \ + __macro(mublasDtrsmBatched); \ + __macro(mublasCtrsmBatched); \ + __macro(mublasZtrsmBatched); + // __macro(mublasHgemm); + //__macro(mublasSgemmEx); + //__macro(mublasSgetrfBatched); + //__macro(mublasSgetriBatched); + //__macro(mublasDgetrfBatched); + //__macro(mublasDgetriBatched); + //__macro(mublasSmatinvBatched); + //__macro(mublasDmatinvBatched); + //__macro(mublasSgetrsBatched); +// __macro(mublasDgetrsBatched); + +MUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#define MUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(mublasGemmEx); \ + __macro(mublasSgemmStridedBatched); \ + __macro(mublasDgemmStridedBatched); \ + __macro(mublasCgemmStridedBatched); \ + __macro(mublasZgemmStridedBatched); \ + __macro(mublasHgemmStridedBatched); + +MUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#define MUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(mublasSetMathMode); \ + __macro(mublasGetMathMode); + +MUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#define MUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(mublasGemmBatchedEx); + // __macro(mublasGemmStridedBatchedEx); + +MUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc new file mode 100644 index 0000000000000..cd193688bc347 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mudnn.h" + +namespace phi { +namespace dynload { + +bool HasCUDNN() { + // note: mudnn.so is not imported by dlopen, which will be linked + // in cmakelist.txt. + return true; +} + +void mudnnCreate(Handle** handle, int device) { *handle = new Handle(device); } + +void mudnnSetStream(Handle* handle, musaStream_t stream) { + handle->SetStream(stream); +} + +void mudnnDestroy(Handle* handle) { + if (handle != nullptr) { + delete handle; + handle = nullptr; + } +} + +} // namespace dynload +} // namespace phi +#endif diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h new file mode 100644 index 0000000000000..d05f32a8b5df0 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MUSA +#include +#include + +namespace phi { +namespace dynload { + +using ::musa::dnn::BatchNorm; +using ::musa::dnn::Convolution; +using ::musa::dnn::Handle; +using ::musa::dnn::MemoryHandler; +using ::musa::dnn::Pooling; +using ::musa::dnn::Softmax; +using ::musa::dnn::Tensor; + +extern bool HasCUDNN(); + +void mudnnCreate(Handle** handle, int device); + +void mudnnSetStream(Handle* handle, musaStream_t stream); + +void mudnnDestroy(Handle* handle); + +} // namespace dynload +} // namespace phi +#endif diff --git a/paddle/phi/backends/dynload/mufft.cc b/paddle/phi/backends/dynload/mufft.cc new file mode 100644 index 0000000000000..9e30463ea39fa --- /dev/null +++ b/paddle/phi/backends/dynload/mufft.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mufft.h" + +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace dynload { +std::once_flag mufft_dso_flag; +void* mufft_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +bool HasMUFFT() { + std::call_once(mufft_dso_flag, + []() { mufft_dso_handle = GetMUFFTDsoHandle(); }); + return mufft_dso_handle != nullptr; +} + +void EnforceMUFFTLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + mufft_dso_handle, + phi::errors::PreconditionNotMet( + "Cannot load mufft shared library. 
Cannot invoke method %s.", + fn_name)); +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mufft.h b/paddle/phi/backends/dynload/mufft.h new file mode 100644 index 0000000000000..70bfdd4c1efd1 --- /dev/null +++ b/paddle/phi/backends/dynload/mufft.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MUSA +#include +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mufft_dso_flag; +extern void* mufft_dso_handle; + +extern void EnforceMUFFTLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using mufft_func = decltype(&::__name); \ + std::call_once(mufft_dso_flag, []() { \ + mufft_dso_handle = phi::dynload::GetMUFFTDsoHandle(); \ + }); \ + EnforceMUFFTLoaded(#__name); \ + static void* p_##__name = dlsym(mufft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed mufft functions in HPPL + * different mufft version has different interfaces + **/ +#define MUFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(mufftPlan1d); \ + __macro(mufftPlan2d); \ + __macro(mufftPlan3d); \ + __macro(mufftPlanMany); \ + __macro(mufftMakePlan1d); \ + __macro(mufftMakePlan2d); \ + __macro(mufftMakePlan3d); \ + __macro(mufftMakePlanMany); \ + __macro(mufftEstimate1d); \ + __macro(mufftEstimate2d); \ + __macro(mufftEstimate3d); \ + __macro(mufftEstimateMany); \ + __macro(mufftCreate); \ + __macro(mufftGetSize1d); \ + __macro(mufftGetSize2d); \ + __macro(mufftGetSize3d); \ + __macro(mufftGetSizeMany); \ + __macro(mufftGetSize); \ + __macro(mufftSetWorkArea); \ + __macro(mufftSetAutoAllocation); \ + __macro(mufftExecC2C); \ + __macro(mufftExecR2C); \ + __macro(mufftExecC2R); \ + __macro(mufftExecZ2Z); \ + __macro(mufftExecD2Z); \ + __macro(mufftExecZ2D); \ + __macro(mufftSetStream); \ + __macro(mufftDestroy); \ + __macro(mufftGetVersion); \ + __macro(mufftGetProperty); \ + __macro(mufftXtSetGPUs); \ + __macro(mufftXtMalloc); \ + __macro(mufftXtMemcpy); \ + __macro(mufftXtFree); \ + __macro(mufftXtExecDescriptorC2C); \ + __macro(mufftXtExecDescriptorR2C); \ + __macro(mufftXtExecDescriptorC2R); \ + __macro(mufftXtExecDescriptorZ2Z); \ + __macro(mufftXtExecDescriptorD2Z); \ + __macro(mufftXtExecDescriptorZ2D); \ + __macro(mufftXtQueryPlan); \ + __macro(mufftXtSetCallback); \ + __macro(mufftXtClearCallback); \ + __macro(mufftXtMakePlanMany); \ + __macro(mufftXtGetSizeMany); \ + __macro(mufftXtExec); \ + __macro(mufftXtExecDescriptor); +MUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) + + +inline const char *mufftGetErrorString(mufftResult_t status) { + switch (status) { + case 
MUFFT_SUCCESS: + return "'MUFFT_SUCCESS'. The mufft operation was successful."; + case MUFFT_INVALID_PLAN: + return "'MUFFT_INVALID_PLAN'. mufft was passed an invalid plan handle."; + case MUFFT_ALLOC_FAILED: + return "'MUFFT_ALLOC_FAILED'. mufft failed to allocate GPU or CPU " + "memory."; + case MUFFT_INVALID_TYPE: + return "'MUFFT_INVALID_TYPE'. No longer used."; + case MUFFT_INVALID_VALUE: + return "'MUFFT_INVALID_VALUE'. User specified an invalid pointer or " + "parameter."; + case MUFFT_INTERNAL_ERROR: + return "'MUFFT_INTERNAL_ERROR'. Driver or internal mufft library " + "error."; + case MUFFT_EXEC_FAILED: + return "'MUFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; + case MUFFT_SETUP_FAILED: + return "'MUFFT_SETUP_FAILED'. The mufft library failed to initialize."; + case MUFFT_INVALID_SIZE: + return "'MUFFT_INVALID_SIZE'. User specified an invalid transform size."; + case MUFFT_UNALIGNED_DATA: + return "'MUFFT_UNALIGNED_DATA'. No longer used."; + case MUFFT_INCOMPLETE_PARAMETER_LIST: + return "'MUFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; + case MUFFT_INVALID_DEVICE: + return "'MUFFT_INVALID_DEVICE'. Execution of a plan was on different " + "GPU than plan creation."; + case MUFFT_PARSE_ERROR: + return "'MUFFT_PARSE_ERROR'. Internal plan database error."; + case MUFFT_NO_WORKSPACE: + return "'MUFFT_NO_WORKSPACE'. No workspace has been provided prior to " + "plan execution."; + case MUFFT_NOT_IMPLEMENTED: + return "'MUFFT_NOT_IMPLEMENTED'. Function does not implement " + "functionality for parameters given."; + case MUFFT_LICENSE_ERROR: + return "'MUFFT_LICENSE_ERROR'. Operation is not supported for " + "parameters given."; + case MUFFT_NOT_SUPPORTED: + return "'MUFFT_NOT_SUPPORTED'. Operation is not supported for " + "parameters given."; + default: + return "mufft_STATUS_UNKNOWN_ERROR"; + } +} + +} // namespace dynload +} // namespace phi + +#endif diff --git a/paddle/phi/backends/dynload/murand.cc b/paddle/phi/backends/dynload/murand.cc new file mode 100644 index 0000000000000..bbeeb7bcd5898 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/murand.h" + +namespace phi { +namespace dynload { + +std::once_flag murand_dso_flag; +void *murand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h new file mode 100644 index 0000000000000..28380cd9423f0 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag murand_dso_flag; +extern void *murand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + murandStatus_t operator()(Args... args) { \ + using murandFunc = decltype(&::__name); \ + std::call_once(murand_dso_flag, []() { \ + murand_dso_handle = phi::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(murand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc new file mode 100644 index 0000000000000..2173a8d6cdd81 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace phi { +namespace dynload { + +std::once_flag musa_dso_flag; +void* musa_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { + std::call_once(musa_dso_flag, []() { musa_dso_handle = GetCUDADsoHandle(); }); + return musa_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h new file mode 100644 index 0000000000000..3534ab8213c93 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag musa_dso_flag; +extern void* musa_dso_handle; +extern bool HasCUDADriver(); + +#define DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using musa_func = decltype(&::__name); \ + std::call_once(musa_dso_flag, []() { \ + musa_dso_handle = phi::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(musa_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet); + +MUSA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc new file mode 100644 index 0000000000000..9cd25270a1016 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace phi { +namespace dynload { + +std::once_flag musartc_dso_flag; +void* musartc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { + std::call_once(musartc_dso_flag, + []() { musartc_dso_handle = GetNVRTCDsoHandle(); }); + return musartc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h new file mode 100644 index 0000000000000..ee85bebc503ec --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// #include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" +#include "paddle/phi/core/enforce.h" + +// TODO(MTAI): The following musa runtime compiling functions are not supported +// now. Here empty implementations are given temporarily. When compiler MCC +// supports these functions, we will replace them. +typedef struct _mtrtcProgram *mtrtcProgram; + +typedef enum { + MTRTC_SUCCESS = 0, + MTRTC_ERROR_OUT_OF_MEMORY = 1, + MTRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + MTRTC_ERROR_INVALID_INPUT = 3, + MTRTC_ERROR_INVALID_PROGRAM = 4, + MTRTC_ERROR_INVALID_OPTION = 5, + MTRTC_ERROR_COMPILATION = 6, + MTRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + MTRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + MTRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + MTRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, + MTRTC_ERROR_INTERNAL_ERROR = 11 +} mtrtcResult; + +inline mtrtcResult mtrtcVersion(int *major, int *minor) { + PADDLE_THROW( + phi::errors::Unimplemented("mtrtcVersion is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline const char *mtrtcGetErrorString(mtrtcResult result) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcGetErrorString is not supported on MUSA now!")); + return "mtrtcGetErrorString is not supported on MUSA now!"; +} + +inline mtrtcResult mtrtcCompileProgram(mtrtcProgram prog, + int numOptions, + const char *const *options) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcCompileProgram is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcCreateProgram(mtrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char *const *headers, + const char *const *includeNames) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcCreateProgram is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcDestroyProgram(mtrtcProgram *prog) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcDestroyProgram is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcGetMUSA(mtrtcProgram prog, char *musa) { + PADDLE_THROW( + phi::errors::Unimplemented("mtrtcGetMUSA is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcGetMUSASize(mtrtcProgram prog, size_t *musaSizeRet) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcGetMUSASize is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcGetProgramLog(mtrtcProgram prog, char *log) { + PADDLE_THROW(phi::errors::Unimplemented( + "mtrtcGetProgramLog is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +inline mtrtcResult mtrtcGetProgramLogSize(mtrtcProgram prog, + size_t *logSizeRet) { + PADDLE_THROW(phi::errors::Unimplemented( + 
"mtrtcGetProgramLogSize is not supported on MUSA now!")); + return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; +} + +namespace phi { +namespace dynload { + +extern std::once_flag musartc_dso_flag; +extern void *musartc_dso_handle; +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using musartc_func = decltype(&::__name); \ + std::call_once(musartc_dso_flag, []() { \ + musartc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(musartc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.cc b/paddle/phi/backends/dynload/musparse.cc new file mode 100644 index 0000000000000..40d766f963c40 --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace phi { +namespace dynload { + +std::once_flag musparse_dso_flag; +void *musparse_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h new file mode 100644 index 0000000000000..e63182943190d --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag musparse_dso_flag; +extern void *musparse_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + musparseStatus_t operator()(Args... args) { \ + using Func = decltype(&::__name); \ + std::call_once(musparse_dso_flag, []() { \ + musparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(musparse_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) +#define MUSPARSE_ROUTINE_EACH(__macro) \ + __macro(musparseCreateHandle); \ + __macro(musparseDestroyHandle); \ + __macro(musparseSetStream); \ + __macro(musparseCreateMatDescr); \ + __macro(musparseSnnz); \ + __macro(musparseDnnz); \ + __macro(musparseSetMatType); \ + __macro(musparseSetMatIndexBase); \ + __macro(musparseCreateCsr); \ + __macro(musparseCreateCoo); \ + __macro(musparseCreateDnMat); \ + __macro(musparseCreateDnVec); \ + __macro(musparseSpMM); \ + __macro(musparseDestroySpMat); \ + __macro(musparseDestroyDnMat); \ + __macro(musparseDestroyDnVec); \ + __macro(musparseSpMV); \ + __macro(musparseSDDMM_bufferSize); \ + __macro(musparseSDDMM_preprocess); \ + __macro(musparseSDDMM); \ + __macro(musparseDnMatSetStridedBatch); \ + __macro(musparseCooSetStridedBatch); \ + __macro(musparseCsrSetStridedBatch); + +MUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) + +#endif // PADDLE_WITH_MUSA + +#undef DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h index 91b6f5dcd58dc..a5759b67e8df7 100644 --- a/paddle/phi/backends/dynload/nccl.h +++ b/paddle/phi/backends/dynload/nccl.h @@ -42,18 +42,18 @@ extern void* nccl_dso_handle; #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(ncclGetUniqueId); \ + __macro(mcclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(ncclAllReduce); \ - __macro(ncclBcast); \ - __macro(ncclAllGather); \ - __macro(ncclGroupStart); \ - __macro(ncclGroupEnd); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index e1018a3f253fa..651cc9c68b243 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -42,18 +42,18 @@ extern void* rccl_dso_handle; #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(ncclGetUniqueId); \ + __macro(mcclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ 
__macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(ncclAllReduce); \ - __macro(ncclBcast); \ - __macro(ncclAllGather); \ - __macro(ncclGroupStart); \ - __macro(ncclGroupEnd); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index e1f3492f76870..2b733c01bc01b 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,6 +72,25 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; + + + +// Forward declaration of MUSA runtime types. +using musaStream_t = struct MUstream_st *; +using musaEvent_t = struct MUevent_st *; +using mublasHandle_t = struct _mublasHandle_t *; +namespace musa { +namespace dnn { +struct Handle; +} // namespace dnn +} // namespace musa +using mudnnHandle_t = musa::dnn::Handle *; +using musparseHandle_t = struct _musparse_handle *; +using mublasLtHandle_t = struct mublasLtContext *; +using mcclComm_t = struct mcclComm *; + + + /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 8d46c3e34cabd..f250fb365ce85 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -51,6 +51,16 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/mublas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + + // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. 
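// NOTE (editorial sketch, not part of this patch): the DECLARE_DYNAMIC_LOAD_*_WRAP
// macros in the musartc, musparse, nccl, and rccl headers above all follow the
// same lazy-binding idiom: open the vendor library once under std::call_once,
// then resolve each symbol with dlsym on first use. A minimal standalone sketch
// of that idiom follows; the library path, namespace, and function names are
// placeholders, not Paddle APIs.
#include <dlfcn.h>  // dlopen, dlsym
#include <mutex>    // std::once_flag, std::call_once

namespace dynload_sketch {
std::once_flag dso_flag;
void* dso_handle = nullptr;

// Resolve `symbol` from a (placeholder) shared library and invoke it.
template <typename Ret, typename... Args>
Ret CallLazily(const char* symbol, Args... args) {
  std::call_once(dso_flag,
                 [] { dso_handle = dlopen("libplaceholder.so", RTLD_LAZY); });
  using Fn = Ret (*)(Args...);
  auto fn = reinterpret_cast<Fn>(dlsym(dso_handle, symbol));
  return fn(args...);  // production code should check dso_handle and fn
}
}  // namespace dynload_sketch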
#include "unsupported/Eigen/CXX11/Tensor" @@ -119,6 +129,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -143,6 +156,11 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif +#ifdef PADDLE_WITH_MUSA +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void* user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void* user_data) @@ -170,6 +188,8 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); +#elif defined(PADDLE_WITH_MUSA) + auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -178,6 +198,8 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -248,9 +270,9 @@ struct GPUContext::Impl { DestoryInternalWorkspace(); DestoryInternalEigenDevice(); phi::DestroySparseHandle(sparse_handle_); - phi::DestroySolverHandle(solver_handle_); + // phi::DestroySolverHandle(solver_handle_); phi::DestroyDnnHandle(dnn_handle_); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (nccl_comm_) { // NOTE(liyurui): It is not recommend calling CUDA runtime API // in destructor. 
Since we can not ensure the release order of @@ -264,7 +286,7 @@ struct GPUContext::Impl { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); - phi::DestroyBlasLtHandle(blaslt_handle_); + // phi::DestroyBlasLtHandle(blaslt_handle_); } if (stream_owned_ && stream_) { delete stream_; @@ -425,24 +447,24 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator); } - void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } + // void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } - void SetBlasLtHandle(std::function&& handle_creator) { - blaslt_handle_creator_ = std::move(handle_creator); - } + // void SetBlasLtHandle(std::function&& handle_creator) { + // blaslt_handle_creator_ = std::move(handle_creator); + // } - blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - phi::InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); - return blaslt_handle_; - } + // blasLtHandle_t GetBlasLtHandle() { + // std::call_once(flag_blaslt_, [&]() { + // if (!blaslt_handle_) { + // if (!blaslt_handle_creator_) + // phi::InitBlasLtHandle(&blaslt_handle_); + // else + // blaslt_handle_ = blaslt_handle_creator_(); + // } + // }); + // PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); + // return blaslt_handle_; + // } dnnHandle_t GetDnnHandle() { std::call_once(flag_dnn_, [&]() { @@ -464,6 +486,11 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (owned_ && dnn_handle_ != nullptr) { + phi::dynload::mudnnDestroy(dnn_handle_); + dnn_handle_ = nullptr; + } #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -478,25 +505,25 @@ struct GPUContext::Impl { dnn_handle_creator_ = std::move(handle_creator); } - solverHandle_t GetSolverHandle() { - std::call_once(flag_slover_, [&]() { - if (!solver_handle_) { - if (!solver_handle_creator_) { - phi::InitSolverHandle(&solver_handle_, stream()); - } else { - solver_handle_ = solver_handle_creator_(); - } - } - }); - PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); - return solver_handle_; - } + // solverHandle_t GetSolverHandle() { + // std::call_once(flag_slover_, [&]() { + // if (!solver_handle_) { + // if (!solver_handle_creator_) { + // phi::InitSolverHandle(&solver_handle_, stream()); + // } else { + // solver_handle_ = solver_handle_creator_(); + // } + // } + // }); + // PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + // return solver_handle_; + // } - void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + // void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - void SetSolverHandle(std::function&& handle_creator) { - solver_handle_creator_ = std::move(handle_creator); - } + // void SetSolverHandle(std::function&& handle_creator) { + // solver_handle_creator_ = std::move(handle_creator); + // } sparseHandle_t GetSparseHandle() { std::call_once(flag_sparse_, [&]() { @@ -529,6 +556,9 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) +#elif defined(PADDLE_WITH_MUSA) + musaError_t e_sync = musaSuccess; + e_sync = 
musaStreamSynchronize(stream()); #else // PADDLE_WITH_HIP cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) @@ -547,21 +577,23 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif } - ncclComm_t GetNcclComm() const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + mcclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) // PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); return nccl_comm_; #endif return nullptr; } - void SetNcclComm(ncclComm_t comm) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + void SetNcclComm(mcclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) nccl_comm_ = comm; #endif } @@ -678,6 +710,8 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -700,6 +734,12 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif + +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -712,7 +752,7 @@ struct GPUContext::Impl { } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) phi::backends::gpu::GpuStreamSync(stream()); #endif { @@ -764,12 +804,12 @@ struct GPUContext::Impl { std::function blas_tensor_core_handle_creator_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; std::function blas_tf32_tensor_core_handle_creator_{nullptr}; - blasLtHandle_t blaslt_handle_{nullptr}; - std::function blaslt_handle_creator_{nullptr}; + // blasLtHandle_t blaslt_handle_{nullptr}; + // std::function blaslt_handle_creator_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; std::function dnn_handle_creator_{nullptr}; - solverHandle_t solver_handle_{nullptr}; - std::function solver_handle_creator_{nullptr}; + // solverHandle_t solver_handle_{nullptr}; + // std::function solver_handle_creator_{nullptr}; sparseHandle_t sparse_handle_{nullptr}; std::function sparse_handle_creator_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; @@ -783,7 +823,7 @@ struct GPUContext::Impl { std::once_flag flag_tensorcore_cublas_; std::once_flag flag_eigen_device_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. @@ -792,7 +832,7 @@ struct GPUContext::Impl { // NOTE: Distributed communicator, distributed framework manages its // resources. 
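// NOTE (editorial sketch, not part of this patch): the gpu_context.cc hunks
// above repeatedly extend the existing HIP/CUDA branches with a
// PADDLE_WITH_MUSA case (stream sync, event wait/record, stream callbacks,
// workspace alloc/free), and the communicator member declared just below
// switches from ncclComm_t to mcclComm_t under the widened
// NCCL/RCCL/MCCL guard. A reduced sketch of the dispatch pattern;
// SKETCH_EVENT_RECORD is an illustrative name, not used by Paddle.
#if defined(PADDLE_WITH_HIP)
#define SKETCH_EVENT_RECORD(ev, stream) hipEventRecord(ev, stream)
#elif defined(PADDLE_WITH_MUSA)
#define SKETCH_EVENT_RECORD(ev, stream) musaEventRecord(ev, stream)
#else  // PADDLE_WITH_CUDA
#define SKETCH_EVENT_RECORD(ev, stream) cudaEventRecord(ev, stream)
#endif
// A call site then stays backend-agnostic, e.g.
//   PADDLE_ENFORCE_GPU_SUCCESS(SKETCH_EVENT_RECORD(ev, stream()));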
- ncclComm_t nccl_comm_{nullptr}; + mcclComm_t nccl_comm_{nullptr}; #endif mutable std::mutex blas_mtx_; @@ -839,13 +879,13 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } -blasLtHandle_t GPUContext::cublaslt_handle() const { - return impl_->GetBlasLtHandle(); -} +// blasLtHandle_t GPUContext::cublaslt_handle() const { +// return impl_->GetBlasLtHandle(); +// } -solverHandle_t GPUContext::cusolver_dn_handle() const { - return impl_->GetSolverHandle(); -} +// solverHandle_t GPUContext::cusolver_dn_handle() const { +// return impl_->GetSolverHandle(); +// } sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); @@ -914,9 +954,9 @@ void GPUContext::AddStreamCallback( void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } -ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } +mcclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } -void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } +void GPUContext::set_nccl_comm(mcclComm_t comm) { impl_->SetNcclComm(comm); } void GPUContext::Init() { impl_->allocator_ = const_cast(&this->GetAllocator()); // NOLINT @@ -965,13 +1005,13 @@ void GPUContext::SetBlasTF32Handle(std::function&& func) { impl_->SetBlasTF32Handle(std::move(func)); } -void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { - impl_->SetBlasLtHandle(blaslt); -} +// void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { +// impl_->SetBlasLtHandle(blaslt); +// } -void GPUContext::SetBlasLtHandle(std::function&& func) { - impl_->SetBlasLtHandle(std::move(func)); -} +// void GPUContext::SetBlasLtHandle(std::function&& func) { +// impl_->SetBlasLtHandle(std::move(func)); +// } void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); @@ -981,13 +1021,13 @@ void GPUContext::SetDnnHandle(std::function&& func) { impl_->SetDnnHandle(std::move(func)); } -void GPUContext::SetSolverHandle(solverHandle_t handle) { - impl_->SetSolverHandle(handle); -} +// void GPUContext::SetSolverHandle(solverHandle_t handle) { +// impl_->SetSolverHandle(handle); +// } -void GPUContext::SetSolverHandle(std::function&& func) { - impl_->SetSolverHandle(std::move(func)); -} +// void GPUContext::SetSolverHandle(std::function&& func) { +// impl_->SetSolverHandle(std::move(func)); +// } void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); @@ -1046,7 +1086,7 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUPinnedContext::GPUPinnedContext() { eigen_device_ = std::make_unique(); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 8cd0d414bc105..19eb5dd05cd3c 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) #include @@ -109,10 +109,10 @@ class PADDLE_API GPUContext : public DeviceContext, blasHandle_t cublas_handle() const; /*! \brief Return cublasLt handle in the device context. 
*/ - blasLtHandle_t cublaslt_handle() const; + // blasLtHandle_t cublaslt_handle() const; /*! \brief Return cusolver handle in the device context. */ - solverHandle_t cusolver_dn_handle() const; + // solverHandle_t cusolver_dn_handle() const; /*! \brief Return cusparse handle in the device context. */ sparseHandle_t cusparse_handle() const; @@ -183,10 +183,10 @@ class PADDLE_API GPUContext : public DeviceContext, public: /*! \brief Return nccl communicators. */ - ncclComm_t nccl_comm() const; + mcclComm_t nccl_comm() const; /*! \brief Set nccl communicators. */ - void set_nccl_comm(ncclComm_t comm); + void set_nccl_comm(mcclComm_t comm); public: // NOTE: DeviceContext hold resources. Used in training scenarios. @@ -232,14 +232,14 @@ class PADDLE_API GPUContext : public DeviceContext, void SetBlasTF32Handle(blasHandle_t); void SetBlasTF32Handle(std::function&&); - void SetBlasLtHandle(blasLtHandle_t); - void SetBlasLtHandle(std::function&&); + // void SetBlasLtHandle(blasLtHandle_t); + // void SetBlasLtHandle(std::function&&); void SetDnnHandle(dnnHandle_t); void SetDnnHandle(std::function&&); - void SetSolverHandle(solverHandle_t); - void SetSolverHandle(std::function&&); + // void SetSolverHandle(solverHandle_t); + // void SetSolverHandle(std::function&&); void SetSparseHandle(sparseHandle_t); void SetSparseHandle(std::function&&); @@ -276,7 +276,7 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using KPSContext = GPUContext; #endif @@ -287,7 +287,7 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Currently, GPUPinnedContext is only used to data copying. 
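// NOTE (editorial sketch, not part of this patch): after the declaration
// changes above, GPUContext hands out its communicator as mcclComm_t instead
// of ncclComm_t, while the cublasLt and cusolver accessors are commented out.
// A hedged call-site sketch; BindComm is an illustrative helper, not a Paddle
// API, and it assumes the phi::GPUContext / mcclComm_t declarations introduced
// by this patch (e.g. via "paddle/phi/backends/gpu/gpu_context.h").
inline void BindComm(phi::GPUContext* ctx, mcclComm_t comm) {
  ctx->set_nccl_comm(comm);             // setter now takes mcclComm_t
  mcclComm_t bound = ctx->nccl_comm();  // getter now returns mcclComm_t
  (void)bound;                          // silence unused-variable warnings
}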
class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 4a6b9d2fd87f1..e791326d71fd4 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -16,57 +16,66 @@ #pragma once #include "paddle/phi/backends/gpu/forwards.h" - +// #include "mudnn/export/c/mudnn_compatible.h" namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; - +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t,musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t,musaEvent_t); -DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, - cudnnActivationStruct, - miopenActivationDescriptor); -DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, - cudnnTensorStruct, - miopenTensorDescriptor); -DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, - cudnnFilterStruct, - miopenTensorDescriptor); -DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, - cudnnFilterDescriptor_t, - miopenTensorDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, - cudnnConvolutionStruct, - miopenConvolutionDescriptor); -DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, - cudnnConvolutionDescriptor_t, - miopenConvolutionDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, - cudnnPoolingDescriptor_t, - miopenPoolingDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, - cudnnDropoutDescriptor_t, - miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); +// DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, +// cudnnActivationStruct, +// miopenActivationDescriptor, +// mudnnActivationStruct); +// DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, +// cudnnTensorStruct, +// miopenTensorDescriptor, +// mudnnTensorStruct); +// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, +// cudnnFilterStruct, +// miopenTensorDescriptor, +// mudnnFilterStruct); +// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, +// cudnnFilterDescriptor_t, +// miopenTensorDescriptor_t, +// mudnnFilterDescriptor_t); +// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, +// cudnnConvolutionStruct, +// miopenConvolutionDescriptor, +// mudnnConvolutionStruct); +// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, +// cudnnConvolutionDescriptor_t, +// miopenConvolutionDescriptor_t, +// mudnnConvolutionDescriptor_t); +// DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, +// cudnnPoolingDescriptor_t, +// miopenPoolingDescriptor_t, +// mudnnPoolingDescriptor_t); +// DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, +// cudnnDropoutDescriptor_t, +// miopenDropoutDescriptor_t, +// mudnnDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. 
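// NOTE (editorial sketch, not part of this patch): with the MUSA_TYPE
// parameter added to DECLARE_TYPE_FOR_GPU above, every declaration resolves to
// one backend-specific alias. Under PADDLE_WITH_MUSA, for example,
//   DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t);
// expands to
//   using gpuStream_t = musaStream_t;
// The blasLt and solver aliases stay commented out just below, matching the
// handles that this patch disables in gpu_context.h and gpu_context.cc.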
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); -DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); +// DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle, musolverDnHandle_t); -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle, musparseHandle_t); #undef DECLARE_TYPE_FOR_GPU diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 0f79e2a645ab3..5c0c475b140ff 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index f37afa3deeb74..30cf3fae80519 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,11 +14,14 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" +#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 2353b42794ffd..8afa826408cb7 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,10 +13,12 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index ebf57bd06eb19..2d1b7c1a98f27 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index fd712baf75480..4e300a3031a25 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,10 +16,12 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index b9c49cb569663..98ebea87eedfd 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,6 +16,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -143,7 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || defined(__MUSACC__)|| (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { @@ -395,188 +399,12 @@ CUDA_ATOMIC_WRAPPER(Add, complex) { CudaAtomicAdd(imag, val.imag)); } -// For atomicMul. -CUDA_ATOMIC_WRAPPER(Mul, int) { - int res = *address, old = res; // NOLINT - do { - old = res; - res = atomicCAS(address, // NOLINT - old, // NOLINT - val * old); // NOLINT - } while (old != res); - return res; -} - -CUDA_ATOMIC_WRAPPER(Mul, unsigned int) { - unsigned int res = *address, old = res; // NOLINT - do { - old = res; - res = atomicCAS(address, // NOLINT - old, // NOLINT - val * old); // NOLINT - } while (old != res); - return res; -} -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address, assumed, val * assumed); - } while (assumed != old); - return old; -} - -CUDA_ATOMIC_WRAPPER(Mul, int64_t) { - // Here, we check long long int must be int64_t. 
- static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address, old = res; // NOLINT - do { - old = res; - res = (long long int)atomicCAS( // NOLINT - (unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val * (unsigned long long int)old); // NOLINT - } while (old != res); - return res; -} - -CUDA_ATOMIC_WRAPPER(Mul, float) { - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - old = atomicCAS( - address_as_i, assumed, __float_as_int(val * __int_as_float(assumed))); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Mul, double) { - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val * __longlong_as_double(assumed))); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) { - phi::dtype::float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) * x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) { - phi::dtype::float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(static_cast(high_half) * x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. 
- do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // The bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(static_cast(low_half) * x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // The bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(static_cast(high_half) * x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) { - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} - // For atomicMax USE_CUDA_ATOMIC(Max, int); USE_CUDA_ATOMIC(Max, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. // It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT @@ -762,7 +590,7 @@ USE_CUDA_ATOMIC(Min, int); USE_CUDA_ATOMIC(Min, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. 
// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a29b5e110922a..89471ba29aee0 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -37,6 +37,10 @@ #include "paddle/phi/backends/dynload/rocsparse.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musparse.h" +#endif + #include "glog/logging.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -64,10 +68,9 @@ void InitGpuProperties(Place place, *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); +#ifdef PADDLE_WITH_CUDA const gpuDeviceProp& prop = backends::gpu::GetDeviceProperties(place.GetDeviceId()); - -#ifdef PADDLE_WITH_CUDA static const std::set compiled_archs{CUDA_REAL_ARCHS}; // Make sure compiled cuda arch is as same as runtime cuda arch. if (compiled_archs.find(*compute_capability) == compiled_archs.cend() && @@ -115,6 +118,17 @@ void InitGpuProperties(Place place, } #endif +#ifdef PADDLE_WITH_MUSA + LOG_FIRST_N(INFO, 1) << "Please NOTE: device: " + << static_cast(place.device) + << ", GPU Compute Capability: " + << *compute_capability / 10 << "." + << *compute_capability % 10 + << ", Driver API Version: " << *driver_version / 10000 + << "." << (*driver_version % 10000) / 100 + << ", Runtime API Version: " << *runtime_version / 10000 + << "." << (*runtime_version % 10000) / 100; +#else // TODO(wilber): glog may be replaced in the future? LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << static_cast(place.device) @@ -126,6 +140,7 @@ void InitGpuProperties(Place place, << ", Runtime API Version: " << *runtime_version / 1000 << "." << (*runtime_version % 100) / 10; +#endif #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( @@ -144,42 +159,62 @@ void InitGpuProperties(Place place, << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } +#elif defined(PADDLE_WITH_MUSA) + // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. + // Requests have been submitted to Mudnn. + // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + size_t mudnn_dso_ver = 2500; + LOG_FIRST_N(INFO, 1) << "device: " << static_cast(place.device) + << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." + << (mudnn_dso_ver % 1000) / 100 << "."; + + // Check MUSA/MUDNN version compatiblity + auto local_musa_version = *driver_version; + int compile_musa_version = MUSA_VERSION; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_musa_version / 100 < compile_musa_version / 100) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUSA%d/muDNN%d," + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " + "which will cause serious incompatible bug. 
" + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10000, + MUDNN_VERSION / 1000, + local_musa_version / 10000, + mudnn_dso_ver / 1000)); +#endif + if (local_musa_version < compile_musa_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with MUSA " + << compile_musa_version / 10000 << "." + << (compile_musa_version % 1000) / 100 + << ", but MUSA runtime version in your machine is " + << local_musa_version / 10000 << "." + << (local_musa_version % 1000) / 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MUSA " + "version."; + } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - auto get_cudnn_major = [](auto version) { - if (version < 9000) { - return version / 1000; - } - // CUDNN changes the CUDNN_VERSION rules after 9.0 - return version / 10000; - }; - auto get_cudnn_minor = [](auto version) { - if (version < 9000) { - return (version % 1000) / 100; - } - // CUDNN changes the CUDNN_VERSION rules after 9.0 - return (version % 10000) / 100; - }; - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) - << ", cuDNN Version: " - << get_cudnn_major(cudnn_dso_ver) << "." - << get_cudnn_minor(cudnn_dso_ver) << "."; + << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." + << (cudnn_dso_ver % 1000) / 100 << "."; // Check CUDA/CUDNN version compatiblity auto local_cuda_version = (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; - - // Compute cuDNN major - auto local_cudnn_major = get_cudnn_major(cudnn_dso_ver); - size_t compile_cudnn_major = CUDNN_MAJOR; - #if defined(__linux__) PADDLE_ENFORCE_EQ( (local_cuda_version / 10 < compile_cuda_version / 10) && - (local_cudnn_major < compile_cudnn_major), + (cudnn_dso_ver / 1000 < CUDNN_VERSION / 1000), false, phi::errors::InvalidArgument( "The installed Paddle is compiled with CUDA%d/cuDNN%d," @@ -188,9 +223,9 @@ void InitGpuProperties(Place place, "Please recompile or reinstall Paddle with compatible CUDA/cuDNN " "version.", compile_cuda_version / 10, - compile_cudnn_major, + CUDNN_VERSION / 1000, local_cuda_version / 10, - local_cudnn_major)); + cudnn_dso_ver / 1000)); #endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) @@ -206,10 +241,14 @@ void InitGpuProperties(Place place, #endif } + void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -220,6 +259,8 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -231,6 +272,9 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); +#elif defined(PADDLE_WITH_MUSA) + phi::dynload::mublasCreate(blas_handle); + phi::dynload::mublasSetStream(*blas_handle, 
stream); #else // PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( @@ -244,6 +288,11 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mublasDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -252,20 +301,20 @@ void DestroyBlasHandle(blasHandle_t handle) { #endif // PADDLE_WITH_HIP } -void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtCreate(blaslt_handle); -#endif -} +// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 +// phi::dynload::cublasLtCreate(blaslt_handle); +// #endif +// } -void DestroyBlasLtHandle(blasLtHandle_t handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - if (handle != nullptr) { - phi::dynload::cublasLtDestroy(handle); - handle = nullptr; - } -#endif -} +// void DestroyBlasLtHandle(blasLtHandle_t handle) { +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 +// if (handle != nullptr) { +// phi::dynload::cublasLtDestroy(handle); +// handle = nullptr; +// } +// #endif +// } void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { if (phi::dynload::HasCUDNN()) { @@ -289,6 +338,9 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#elif defined(PADDLE_WITH_MUSA) + phi::dynload::mudnnCreate(handle, place.device); + phi::dynload::mudnnSetStream(*handle, stream); #else auto version = phi::dynload::cudnnGetVersion(); auto local_cudnn_major = @@ -319,6 +371,11 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mudnnDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -327,21 +384,21 @@ void DestroyDnnHandle(dnnHandle_t handle) { #endif // PADDLE_WITH_HIP } -void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -#ifndef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); -#endif -} +// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); +// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +// #endif +// } -void DestroySolverHandle(solverHandle_t solver_handle) { -#ifndef PADDLE_WITH_HIP - if (solver_handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); - solver_handle = nullptr; - } -#endif -} +// void DestroySolverHandle(solverHandle_t solver_handle) { +// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +// if (solver_handle != nullptr) { +// PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); +// solver_handle = nullptr; +// } +// #endif +// } void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { // ROCM is not yet supported @@ -354,6 
+411,9 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { #elif defined(PADDLE_WITH_HIP) phi::dynload::rocsparse_create_handle(handle); phi::dynload::rocsparse_set_stream(*handle, stream); +#elif defined(PADDLE_WITH_MUSA) + phi::dynload::musparseCreateHandle(handle); + phi::dynload::musparseSetStream(*handle, stream); #endif } @@ -370,6 +430,11 @@ void DestroySparseHandle(sparseHandle_t handle) { phi::dynload::rocsparse_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::musparseDestroyHandle(handle); + handle = nullptr; + } #endif } diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index 7bec5eebf5886..df6a131ff315d 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -35,14 +35,14 @@ void DestoryStream(gpuStream_t stream); void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); void DestroyBlasHandle(blasHandle_t handle); -void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); -void DestroyBlasLtHandle(blasLtHandle_t handle); +// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); +// void DestroyBlasLtHandle(blasLtHandle_t handle); void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); void DestroyDnnHandle(dnnHandle_t handle); -void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); -void DestroySolverHandle(solverHandle_t solver_handle); +// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); +// void DestroySolverHandle(solverHandle_t solver_handle); void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); void DestroySparseHandle(sparseHandle_t handle); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 77f403795b6b3..00c0bdf6c545b 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,11 +17,15 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -30,18 +34,39 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_CDUA +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; + +#else // PADDLE_WITH_MUSA +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif // PADDLE_WITH_CUDA + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, + cudaMemcpyKind, + hipMemcpyKind, + musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, + cudaDeviceProp, + hipDeviceProp_t, + musaDeviceProp); +#undef DECLARE_TYPE_FOR_GPU + +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using 
GPU_TYPE = ROCM_TYPE; +#else // PADDLE_WITH_MUSA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif +#endif // PADDLE_WITH_CUDA -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, @@ -50,34 +75,45 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); - #undef DECLARE_TYPE_FOR_GPU +#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, + cudaErrorNotReady, + hipErrorNotReady, + musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice, + musaMemcpyKind::musaMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost, + musaMemcpyKind::musaMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice, + musaMemcpyKind::musaMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || + // defined(PADDLE_WITH_MUSA ) diff --git a/paddle/phi/backends/gpu/musa/mudnn_desc.h b/paddle/phi/backends/gpu/musa/mudnn_desc.h new file mode 100644 index 0000000000000..9de12d586bea0 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/mudnn_desc.h @@ -0,0 +1,202 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
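// NOTE (editorial examples, not part of this patch): two small worked examples
// for the helpers defined below in mudnn_desc.h, with values chosen purely for
// illustration.
//  * TransformDimOrder moves the channel dimension forward: a 4-D NHWC shape
//    {2, 3, 4, 5} (N, H, W, C) becomes {2, 5, 3, 4}.
//  * TensorDescriptor::set derives contiguous strides from dims before calling
//    SetNdInfo; for dims = {2, 3, 4, 5}:
//      strides[3] = 1
//      strides[2] = dims[3] * strides[3] = 5
//      strides[1] = dims[2] * strides[2] = 20
//      strides[0] = dims[1] * strides[1] = 60
//    i.e. strides = {60, 20, 5, 1}.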
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace backends { +namespace gpu { + +template +inline std::vector TransformDimOrder(const std::vector& dims) { + std::vector transformed_dims(dims.begin(), dims.end()); + if (dims.size() < 4) { + return transformed_dims; + } + T H, W, D, C; + if (dims.size() == 4) { + H = dims[1]; + W = dims[2]; + C = dims[3]; + transformed_dims[1] = C; + transformed_dims[2] = H; + transformed_dims[3] = W; + } else { + D = dims[1]; + H = dims[2]; + W = dims[3]; + C = dims[4]; + transformed_dims[1] = C; + transformed_dims[2] = D; + transformed_dims[3] = H; + transformed_dims[4] = W; + } + return transformed_dims; +} + +inline dynload::Tensor::Type ToCudnnDataType(const phi::DataType& t) { + dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; + switch (t) { + case phi::DataType::FLOAT16: + type = dynload::Tensor::Type::HALF; + break; + case phi::DataType::FLOAT32: + type = dynload::Tensor::Type::FLOAT; + break; + case phi::DataType::FLOAT64: + type = dynload::Tensor::Type::DOUBLE; + break; + default: + PD_THROW("Don't support this data type ", t); + } + return type; +} + +class TensorDescriptor { + public: + using T = dynload::Tensor; + TensorDescriptor() : desc_(std::make_unique()) {} + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + void set(const phi::DenseTensor& tensor, const int groups = 1) { + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + desc_->SetType(ToCudnnDataType(tensor.dtype())); + desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); + desc_->SetAddr(tensor.data()); + } + + template + void set(const phi::DenseTensor& tensor, const Type* data) { + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + desc_->SetType(ToCudnnDataType(tensor.dtype())); + desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); + desc_->SetAddr(data); + } + + void set(const std::vector& dims, + const dynload::Tensor::Format format, + const dynload::Tensor::Type dtype) { + std::vector transformed_dims; + std::vector dims_64(dims.begin(), dims.end()); + if (format == dynload::Tensor::Format::NHWC) { + transformed_dims = TransformDimOrder(dims_64); + } else { + transformed_dims = dims_64; + } + desc_->SetFormat(format); + desc_->SetType(dtype); + desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); + } + + void set(const phi::DenseTensor& tensor, + const dynload::Tensor::Format format) { + auto dims = phi::vectorize(tensor.dims()); + auto dtype = ToCudnnDataType(tensor.dtype()); + set(dims, format, dtype); + desc_->SetAddr(tensor.data()); + } + + private: + std::unique_ptr desc_; +}; + +class FilterDescriptor { + public: + using T = phi::dynload::Tensor; + FilterDescriptor() : desc_(std::make_unique()) {} + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(const std::vector& dims, + const dynload::Tensor::Format format, + const dynload::Tensor::Type dtype, + const int groups = 1) { + std::vector transformed_dims; + std::vector dims_64(dims.begin(), 
dims.end()); + if (format == dynload::Tensor::Format::NHWC) { + transformed_dims = TransformDimOrder(dims_64); + } else { + transformed_dims = dims_64; + } + if (groups > 1) { + transformed_dims[1] = transformed_dims[1] / groups; + } + desc_->SetFormat(format); + desc_->SetType(dtype); + desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); + } + + void set(const phi::DenseTensor& tensor, + const dynload::Tensor::Format format, + const int groups = 1) { + auto dims = phi::vectorize(tensor.dims()); + auto dtype = ToCudnnDataType(tensor.dtype()); + set(dims, format, dtype, groups); + desc_->SetAddr(tensor.data()); + } + + private: + std::unique_ptr desc_; +}; + +class ConvolutionDescriptor { + public: + using T = dynload::Convolution; + ConvolutionDescriptor() : desc_(std::make_unique()) {} + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(dynload::Tensor::Type dtype, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations, + bool allow_tf32, + const int groups = 1) { + allow_tf32_ = allow_tf32; + desc_->SetNdInfo( + pads.size(), pads.data(), strides.data(), dilations.data()); + desc_->SetComputeMode(dynload::Convolution::ComputeMode::TENSOR); + desc_->SetGroups(groups); + } + + bool allow_tf32_; + + private: + std::unique_ptr desc_; +}; + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/mudnn_helper.h b/paddle/phi/backends/gpu/musa/mudnn_helper.h new file mode 100644 index 0000000000000..55030e860b421 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/mudnn_helper.h @@ -0,0 +1,323 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "gflags/gflags.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +#define CUDNN_BN_MIN_EPSILON 1e-05 + +DECLARE_bool(cudnn_deterministic); + +namespace phi { +namespace backends { +namespace gpu { + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNDHWC, // add, liyamei + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kMaximumDeterministic, + kAverageExclusive, + kAverageInclusive, +}; + +inline dynload::Pooling::Mode GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + // case PoolingMode::kMaximumDeterministic: + // return CUDNN_POOLING_MAX_DETERMINISTIC; + case PoolingMode::kAverageExclusive: + return dynload::Pooling::Mode::AVGPOOL_COUNT_WITHOUT_PAD; + case PoolingMode::kAverageInclusive: + return dynload::Pooling::Mode::AVGPOOL_COUNT_PAD; + case PoolingMode::kMaximum: + return dynload::Pooling::Mode::MAXPOOL; + default: + PADDLE_THROW( + phi::errors::Unimplemented("Unexpected MUDNN pooling mode.")); + } +} + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const dynload::Tensor::Type type = dynload::Tensor::Type::BFLOAT16; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const dynload::Tensor::Type type = dynload::Tensor::Type::HALF; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const dynload::Tensor::Type type = dynload::Tensor::Type::DOUBLE; + using ScalingParamType = const double; + using BatchNormParamType = double; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline dynload::Tensor::Format GetCudnnTensorFormat( + const DataLayout& order) { // Not use + switch (order) { + case DataLayout::kNHWC: + return dynload::Tensor::Format::NHWC; + case DataLayout::kNCHW: + return dynload::Tensor::Format::NCHW; + case DataLayout::kNCDHW: + return dynload::Tensor::Format::NCDHW; + case DataLayout::kNDHWC: + return dynload::Tensor::Format::NDHWC; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "MUDNN has no equivalent dataLayout for input order.")); + } + return dynload::Tensor::Format::NCHW; +} + +class ScopedTensorDescriptor { + public: + 
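// A minimal sketch of how a trait like CudnnDataType<T> above is typically
// consumed: kernels read the descriptor enum and the alpha/beta scaling
// pointers from the trait instead of switching on dtype by hand. The enum,
// DataTypeTrait, and Launch() below are stand-ins so the sketch compiles as
// plain C++; they are not the mudnn interface.
#include <cstdio>

enum class TensorType { HALF, FLOAT, DOUBLE, BFLOAT16 };

template <typename T>
struct DataTypeTrait;  // maps element type -> descriptor enum + scaling type

template <>
struct DataTypeTrait<float> {
  static constexpr TensorType type = TensorType::FLOAT;
  using ScalingParamType = float;
  static const float* kOne()  { static const float v = 1.0f; return &v; }
  static const float* kZero() { static const float v = 0.0f; return &v; }
};

template <typename T>
void Launch() {
  // alpha = 1, beta = 0 is the usual "y = op(x)" configuration.
  std::printf("type=%d alpha=%f beta=%f\n",
              static_cast<int>(DataTypeTrait<T>::type),
              static_cast<double>(*DataTypeTrait<T>::kOne()),
              static_cast<double>(*DataTypeTrait<T>::kZero()));
}

int main() { Launch<float>(); }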
ScopedTensorDescriptor() {} + ~ScopedTensorDescriptor() PADDLE_MAY_THROW {} + + inline dynload::Tensor descriptor(const dynload::Tensor::Format format, + const dynload::Tensor::Type type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Here, Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + + PADDLE_ENFORCE_EQ( + format, + dynload::Tensor::Format::NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MUDNN.")); + + desc_.SetNdInfo( + static_cast(dims_with_group.size()), dims_with_group.data(), strides.data()); + desc_.SetType(type); + desc_.SetFormat(format); + + return desc_; + } + + template + inline dynload::Tensor& descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + descriptor( + GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); + return desc_; + } + + template + inline dynload::Tensor& descriptor(const phi::DenseTensor& tensor, + const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + desc_.SetAddr(tensor.data()); + descriptor(order, dims, groups); + return desc_; + } + + template + inline dynload::Tensor& descriptor(const T* data, + const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + desc_.SetAddr(data); + descriptor(order, dims, groups); + return desc_; + } + + inline dynload::Tensor& descriptor(const dynload::Tensor::Type mudnn_type, + const std::vector& dim, + const std::vector& stride) { + std::vector dims_64(dim.begin(), dim.end()); + std::vector stride_64(dim.begin(), dim.end()); + desc_.SetType(mudnn_type); + desc_.SetNdInfo(static_cast(dims_64.size()), dims_64.data(), stride_64.data()); + return desc_; + } + + template + inline dynload::Tensor& descriptor(const std::vector& dim, + const std::vector& stride) { + descriptor(CudnnDataType::type, dim, stride); + return desc_; + } + + inline dynload::Tensor& desc() { return desc_; } + + private: + dynload::Tensor desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() {} + ~ScopedPoolingDescriptor() PADDLE_MAY_THROW {} + + inline dynload::Pooling& descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), + pads.size(), + phi::errors::InvalidArgument( + "The size of kernel and pads should be equal. But " + "received size of kernel is %d, size of pads is %d.", + kernel.size(), + pads.size())); + PADDLE_ENFORCE_EQ( + kernel.size(), + strides.size(), + phi::errors::InvalidArgument( + "The size of kernel and strides should be equal. 
But " + "received size of kernel is %d, size of strides is %d.", + kernel.size(), + strides.size())); + const std::vector dilation(kernel.size(), 1); + desc_.SetNdInfo(kernel.size(), + kernel.data(), + pads.data(), + strides.data(), + dilation.data()); + desc_.SetMode(GetPoolingMode(mode)); + return desc_; + } + + dynload::Pooling& desc() { return desc_; } + + private: + dynload::Pooling desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +class ScopedSoftmaxDescriptor { + public: + ScopedSoftmaxDescriptor() {} + ~ScopedSoftmaxDescriptor() PADDLE_MAY_THROW {} + + inline dynload::Softmax& descriptor(const dynload::Softmax::Mode& mode, + const dynload::Softmax::Algorithm& algo, + const int& dim) { + desc_.SetMode(mode); + desc_.SetDim(dim); + desc_.SetAlgorithm(algo); + return desc_; + } + + dynload::Softmax& desc() { return desc_; } + + private: + dynload::Softmax desc_; + DISABLE_COPY_AND_ASSIGN(ScopedSoftmaxDescriptor); +}; + +static void InternalMemFree(void* ptr) { + if (!ptr) { + return; + } + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); +} + +static dynload::MemoryHandler InternalMemAlloc(size_t s) { + void* data = nullptr; + if (s) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&data, s)); + } + return dynload::MemoryHandler(data, InternalMemFree); +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h new file mode 100644 index 0000000000000..f2847daf4dfac --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_device_function.h @@ -0,0 +1,193 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_CUDA_FP16 +// NOTE(): support float16 to half in header file. +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) 
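// A small sketch of the ownership pattern used by InternalMemAlloc above:
// the raw allocation is handed to a handle object together with a deleter,
// so the buffer is released exactly once even on early returns. malloc/free
// stand in for musaMalloc/musaFree here, and std::unique_ptr plays the role
// of dynload::MemoryHandler; this is an illustration, not the real types.
#include <cstdlib>
#include <memory>

using ScratchBuffer = std::unique_ptr<void, void (*)(void*)>;

ScratchBuffer AllocScratch(size_t bytes) {
  void* data = nullptr;
  if (bytes != 0) {
    data = std::malloc(bytes);  // real code: musaMalloc(&data, bytes)
  }
  return ScratchBuffer(data, [](void* p) {
    if (p != nullptr) std::free(p);  // real code: musaFree(p)
  });
}

int main() {
  ScratchBuffer buf = AllocScratch(1024);
  return buf ? 0 : 1;
}  // buffer released automatically here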
\ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + + +#if defined(PADDLE_WITH_MUSA) +// Due to the inconsistency between mcc and nvcc, certain type conversions are not implicitly performed, so we specialize here. +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(unsigned mask, + phi::dtype::float16 val, + int width) { + return (phi::dtype::float16)(__shfl_xor_sync(mask, float(val), width)); +} +#endif + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_MUSA_BF16) && defined(__MUSA_ARCH__) && __MUSA_ARCH__ >= 220 + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_mt_bfloat16(), static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, val.to_mt_bfloat16(), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, 
static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl_sync(mask, val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..7463edc5d9ff6 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_helper.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
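// A host-side model of the warp stage inside reduceSum above: each "lane"
// adds the value held delta lanes ahead of it (the role CudaShuffleDownSync
// plays on device), halving delta each round until lane 0 holds the sum of
// all 32 lanes. This is a plain-C++ illustration of the reduction shape, not
// device code.
#include <cstdio>
#include <vector>

static void WarpReduce(std::vector<double>& lanes) {
  const int warp = static_cast<int>(lanes.size());
  for (int delta = warp / 2; delta > 0; delta /= 2) {
    std::vector<double> next(lanes);
    for (int i = 0; i + delta < warp; ++i) next[i] = lanes[i] + lanes[i + delta];
    lanes.swap(next);
  }
}

int main() {
  std::vector<double> lanes(32, 1.0);  // every lane contributes 1
  WarpReduce(lanes);
  std::printf("lane 0 holds %.0f\n", lanes[0]);  // prints 32
}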
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * + */ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc new file mode 100644 index 0000000000000..cab81b58f5ecb --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +#include "musa_runtime.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + // TODO(@caizhi): mudnnGetVersion is not supported now. + // version info will be returned from mudnnGetVersion later. + const int version_major = 2; + const int version_minor = 5; + const int version_patch = 0; + return version_major * 1000 + version_minor * 100 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + musaError_t status = musaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *musa_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (musa_visible_devices != nullptr) { + std::string musa_visible_devices_str(musa_visible_devices); + if (!musa_visible_devices_str.empty()) { + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\'')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\'') + 1); + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\"')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(musa_visible_devices_str.begin(), + musa_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. 
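// A host-side illustration of why CUDA_KERNEL_LOOP_TYPE above keeps the
// running index in int64_t. With the figures quoted in the comment
// (blockIdx.x = 2172938, blockDim.x = 512), the very first index is already
// above 1.1e9; one more grid-stride step can push a 32-bit counter past
// INT_MAX, where it wraps negative on typical targets and the `i < n` guard
// no longer stops the loop. The grid size below is an assumed example value.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t block_idx = 2172938, block_dim = 512;
  const int64_t grid_dim = 4194304;  // assumed, only to trigger the overflow
  int64_t index = block_idx * block_dim;
  int64_t stride = block_dim * grid_dim;
  std::printf("first index    = %lld\n", static_cast<long long>(index));
  std::printf("after one step = %lld (INT_MAX = %d)\n",
              static_cast<long long>(index + stride), INT_MAX);
  std::printf("as int32       = %d\n", static_cast<int>(index + stride));
  return 0;
}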
No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 10 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + // Note: runtime_version = MAJOR * 10000 + MINOR * 100 + PATCH + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + // Note: driver_version = MAJOR * 10000 + MINOR * 100 + PATCH + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + musaGetDeviceProperties(&g_device_props[id], id)); + }); + //TODO@mtai:we hope not to skip UT that ask compute capacity to be greater than 7/8 + g_device_props[id].major = 9; + g_device_props[id].minor = 9; + return g_device_props[id]; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
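// A compact sketch of the initialization scheme GetDeviceProperties uses
// above: one global std::once_flag sizes the cache, and a per-device
// std::once_flag fills each slot the first time that device is queried.
// DeviceProp and QueryDevice are placeholders for gpuDeviceProp and
// musaGetDeviceProperties; the rest mirrors the pattern in the patch.
#include <memory>
#include <mutex>
#include <vector>

struct DeviceProp { int major = 0, minor = 0; };

static std::once_flag g_size_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_prop_flags;
static std::vector<DeviceProp> g_props;

static DeviceProp QueryDevice(int id) {
  (void)id;
  return DeviceProp{2, 2};  // stub in place of the runtime query
}

const DeviceProp& GetProps(int id, int device_count) {
  std::call_once(g_size_flag, [&] {
    g_prop_flags.resize(device_count);
    g_props.resize(device_count);
    for (int i = 0; i < device_count; ++i) {
      g_prop_flags[i] = std::make_unique<std::once_flag>();
    }
  });
  std::call_once(*g_prop_flags[id], [&] { g_props[id] = QueryDevice(id); });
  return g_props[id];
}

int main() { return GetProps(0, 1).major; }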
GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); + +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/capi/include/c_meta_tensor.h b/paddle/phi/capi/include/c_meta_tensor.h index f4c9a541e526a..08f01084c6abf 100644 --- a/paddle/phi/capi/include/c_meta_tensor.h +++ b/paddle/phi/capi/include/c_meta_tensor.h @@ -39,13 +39,6 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, size_t index, PD_Status *status); -int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, - PD_Status *status); - -int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, - size_t index, - PD_Status *status); - bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status); void PD_MetaTensorSetDims(PD_MetaTensor *tensor, @@ -53,11 +46,6 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, const int64_t *dims, PD_Status *status); -void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, - int64_t nstrides, - const int64_t *strides, - PD_Status *status); - void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index 2df292c6b946b..c4f706c70ccfb 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -41,12 +41,6 @@ int64_t PD_TensorGetDim(const PD_Tensor *tensor, size_t index, PD_Status *status); -int64_t PD_TensorGetNumStrides(const PD_Tensor *tensor, PD_Status *status); - -int64_t PD_TensorGetStride(const PD_Tensor *tensor, - size_t index, - PD_Status *status); - void PD_TensorGetLoD(const PD_Tensor *tensor, PD_List *data, PD_List *offset, @@ -58,22 +52,11 @@ bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); -size_t PD_TensorGetOffset(const PD_Tensor *tensor, PD_Status *status); - void PD_TensorSetDims(PD_Tensor *tensor, int64_t ndims, const int64_t *dims, PD_Status *status); -void PD_TensorSetOffset(PD_Tensor *tensor, - const int64_t offset, - PD_Status *status); - -void PD_TensorSetStrides(PD_Tensor *tensor, - int64_t nstrides, - const int64_t *strides, - PD_Status *status); - void PD_TensorSetDataType(PD_Tensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 75f3e2d9e350e..061561008a95e 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -72,19 +72,6 @@ inline std::vector PD_TensorGetDims(PD_Tensor* tensor, return std::vector(); } -inline std::vector PD_TensorGetStrides(PD_Tensor* tensor, - PD_Status* status) { - int64_t nstrides = PD_TensorGetNumStrides(tensor, status); - if (nstrides > 0) { - std::vector shape(nstrides); - for (int64_t i = 0; i < nstrides; ++i) { - shape[i] = PD_TensorGetStride(tensor, i, status); - } - return shape; - } - return std::vector(); -} - inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, PD_Status* status) { int64_t ndims = PD_MetaTensorGetNumDims(tensor, status); @@ -98,19 +85,6 @@ inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, return std::vector(); } -inline std::vector PD_MetaTensorGetStrides(PD_MetaTensor* tensor, - PD_Status* status) { - int64_t nstrides = PD_MetaTensorGetNumStrides(tensor, status); - if (nstrides > 0) { - std::vector shape(nstrides); - for (int64_t i = 0; i < nstrides; ++i) { - shape[i] = PD_MetaTensorGetStride(tensor, i, status); - } - return shape; - } - return 
std::vector(); -} - template class WrapperBase { public: @@ -160,13 +134,6 @@ class DenseTensor : public WrapperBase { return holder; } - size_t offset() const { - C_Status status; - auto offset = PD_TensorGetOffset(raw_data(), &status); - PD_CHECK_STATUS(status); - return offset; - } - std::vector dims() const { C_Status status; auto dimension = PD_TensorGetDims(raw_data(), &status); @@ -174,13 +141,6 @@ class DenseTensor : public WrapperBase { return dimension; } - std::vector strides() const { - C_Status status; - auto strides = PD_TensorGetStrides(raw_data(), &status); - PD_CHECK_STATUS(status); - return strides; - } - PD_DataType dtype() const { C_Status status; auto data_type = PD_TensorGetPDDataType(raw_data(), &status); @@ -247,18 +207,6 @@ class DenseTensor : public WrapperBase { PD_CHECK_STATUS(status); } - void set_offset(const int64_t& offset) { - C_Status status; - PD_TensorSetOffset(raw_data(), offset, &status); - PD_CHECK_STATUS(status); - } - - void set_strides(const std::vector& strides) { - C_Status status; - PD_TensorSetStrides(raw_data(), strides.size(), strides.data(), &status); - PD_CHECK_STATUS(status); - } - void set_dtype(PD_DataType data_type) { C_Status status; PD_TensorSetDataType(raw_data(), data_type, &status); @@ -565,13 +513,6 @@ class MetaTensor : WrapperBase { return dimension; } - std::vector strides() const { - C_Status status; - auto strides = PD_MetaTensorGetStrides(raw_data(), &status); - PD_CHECK_STATUS(status); - return strides; - } - PD_DataType dtype() const { C_Status status; auto data_type = PD_MetaTensorGetPDDataType(raw_data(), &status); @@ -599,13 +540,6 @@ class MetaTensor : WrapperBase { PD_CHECK_STATUS(status); } - void set_strides(const std::vector& strides) { - C_Status status; - PD_MetaTensorSetStrides( - raw_data(), strides.size(), strides.data(), &status); - PD_CHECK_STATUS(status); - } - void set_dtype(PD_DataType data_type) { C_Status status; PD_MetaTensorSetDataType(raw_data(), data_type, &status); diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index b415ece7e361d..6dc1ff768260d 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,7 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index e9fe2aada1f35..7df79117dbae5 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,7 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index f436ba9d3cde0..6ea6eda1a7f23 100644 --- 
a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -88,36 +88,6 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, return cc_tensor->dims()[index]; } -int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, - PD_Status *status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return 0; - } - *status = C_SUCCESS; - } - - auto cc_tensor = reinterpret_cast(tensor); - return cc_tensor->strides().size(); -} - -int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, - size_t index, - PD_Status *status) { - auto cc_tensor = reinterpret_cast(tensor); - - if (status) { - if (!tensor || index >= static_cast(cc_tensor->strides().size())) { - *status = C_FAILED; - return 0; - } - *status = C_SUCCESS; - } - - return cc_tensor->strides()[index]; -} - bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status) { if (status) { if (!tensor) { @@ -147,22 +117,6 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, cc_tensor->set_dims(common::make_ddim(shape)); } -void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, - int64_t nstrides, - const int64_t *strides, - PD_Status *status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return; - } - *status = C_SUCCESS; - } - auto cc_tensor = reinterpret_cast(tensor); - std::vector shape(strides, strides + nstrides); - cc_tensor->set_strides(common::make_ddim(shape)); -} - void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status) { diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index eb8c8c6f4eb47..31a724447b7c7 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -111,35 +111,6 @@ int64_t PD_TensorGetDim(const PD_Tensor* tensor, return cc_tensor->dims()[index]; } -int64_t PD_TensorGetNumStrides(const PD_Tensor* tensor, PD_Status* status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return 0; - } - *status = C_SUCCESS; - } - - auto cc_tensor = reinterpret_cast(tensor); - return cc_tensor->strides().size(); -} - -int64_t PD_TensorGetStride(const PD_Tensor* tensor, - size_t index, - PD_Status* status) { - auto cc_tensor = reinterpret_cast(tensor); - - if (status) { - if (!tensor || index >= static_cast(cc_tensor->strides().size())) { - *status = C_FAILED; - return 0; - } - *status = C_SUCCESS; - } - - return cc_tensor->strides()[index]; -} - void PD_TensorGetLoD(const PD_Tensor* tensor, PD_List* data, PD_List* offset, @@ -214,19 +185,6 @@ void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { return cc_tensor->Holder().get(); } -size_t PD_TensorGetOffset(const PD_Tensor* tensor, PD_Status* status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return 0; - } - *status = C_SUCCESS; - } - - auto cc_tensor = reinterpret_cast(tensor); - return cc_tensor->offset(); -} - void PD_TensorSetDims(PD_Tensor* tensor, int64_t ndims, const int64_t* dims, @@ -243,36 +201,6 @@ void PD_TensorSetDims(PD_Tensor* tensor, cc_tensor->Resize(common::make_ddim(shape)); } -void PD_TensorSetOffset(PD_Tensor* tensor, - const int64_t offset, - PD_Status* status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return; - } - *status = C_SUCCESS; - } - auto cc_tensor = reinterpret_cast(tensor); - cc_tensor->set_offset(offset); -} - -void PD_TensorSetStrides(PD_Tensor* tensor, - int64_t nstrides, - const int64_t* strides, - PD_Status* status) { - if (status) { - if (!tensor) { - *status = C_FAILED; - return; - } - *status = C_SUCCESS; - } - auto cc_tensor = 
reinterpret_cast(tensor); - std::vector shape(strides, strides + nstrides); - cc_tensor->set_strides(common::make_ddim(shape)); -} - void PD_TensorSetDataType(PD_Tensor* tensor, PD_DataType dtype, PD_Status* status) { diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 64dab3ccdeb3b..4f238496c4149 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,7 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 028851e34c8bc..9609dc50a9a0b 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -31,7 +31,13 @@ #include #endif -#ifndef PADDLE_WITH_HIP +#if defined(__MUSACC__) +#define PADDLE_MUSA_BF16 +#include +#include +#endif + +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -65,13 +71,14 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; -#else -#if defined(PADDLE_CUDA_BF16) +#elif defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); x = *reinterpret_cast(&tmp); +#elif defined(PADDLE_MUSA_BF16) + __mt_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); -#endif #endif } @@ -81,6 +88,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#if defined(PADDLE_MUSA_BF16) + HOSTDEVICE inline explicit bfloat16(const __mt_bfloat16& val) { + x = *reinterpret_cast(&val); // NOLINT + } +#endif + template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} @@ -93,6 +106,13 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#if defined(PADDLE_MUSA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __mt_bfloat16& val) { + x = *reinterpret_cast(&val); // NOLINT + return *this; + } +#endif + HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 
0x3f80 : 0; return *this; @@ -160,16 +180,16 @@ struct PADDLE_ALIGN(2) bfloat16 { // return res; res = res << 16; return *reinterpret_cast(&res); -#else -#ifdef PADDLE_CUDA_BF16 +#elif defined(PADDLE_CUDA_BF16) return __bfloat162float(*reinterpret_cast(&x)); +#elif defined(PADDLE_MUSA_BF16) + return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; std::memcpy( reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; -#endif #endif } @@ -179,6 +199,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#ifdef PADDLE_MUSA_BF16 + HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 5de6290fb7705..4fb04ed0f7f66 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,12 +26,17 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include // NOLINT #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -41,7 +46,7 @@ #define PADDLE_ALIGN(x) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -66,7 +71,7 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -95,6 +100,14 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } +#elif defined(PADDLE_WITH_MUSA) + HOSTDEVICE inline explicit operator muFloatComplex() const { + return make_muFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator muDoubleComplex() const { + return make_muDoubleComplex(real, imag); + } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 1906fd4e57a44..c88d4ac21cd4a 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
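// A standalone illustration of the bit-level bfloat16 conversions that the
// non-intrinsic branches of bfloat16.h above rely on: bfloat16 keeps only
// the upper 16 bits of an IEEE-754 float, so float -> bf16 is a truncating
// right shift and bf16 -> float shifts the pattern back into the high half.
// (The device paths use __float2bfloat16 / __bfloat162float when the MUSA
// intrinsics are available.)
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t FloatToBf16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}

static float Bf16BitsToFloat(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  float x = 3.1415926f;
  float y = Bf16BitsToFloat(FloatToBf16Bits(x));
  std::printf("%f -> %f (bf16 round trip loses low mantissa bits)\n", x, y);
}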
*/ #include "paddle/common/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,7 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) +#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +209,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 9d60b8c6241ae..e4f4a5ae272eb 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,6 +37,10 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #endif @@ -46,12 +50,17 @@ #include #endif +#ifdef __MUSACC__ +#define PADDLE_CUDA_FP16 +#include +#endif + #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -86,8 +95,8 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -106,7 +115,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -148,7 +157,7 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -222,7 +231,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)|| CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -242,7 +251,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -351,7 +360,7 @@ struct 
PADDLE_ALIGN(2) float16 { // CUDA 9.0 regarding the half data type. // ROCM has built-in arithmetic operators as not defined // __HIP_NO_HALF_OPERATORS__ -#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && !defined(__MUSACC__) && CUDA_VERSION < 9000 DEVICE inline half operator+(const half& a, const half& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); @@ -399,7 +408,7 @@ DEVICE inline half operator-(const half& a) { #endif } -#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -475,7 +484,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { #if defined(PADDLE_CUDA_FP16) // HIPCC has compile error if call __device__ function __hadd, __hsub, etc. // in __host__ __device__ function -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(a.to_half(), b.to_half())); } @@ -492,7 +501,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(a.to_half(), b.to_half())); } @@ -509,7 +518,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(a.to_half(), b.to_half())); } @@ -526,7 +535,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(__hdiv(a.to_half(), b.to_half())); } @@ -546,7 +555,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(a.to_half())); } @@ -589,7 +598,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT // HIPCC has compile error if call __device__ function __heq, __hne, etc. 
// in __host__ __device__ function -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(a.to_half(), b.to_half()); } @@ -606,7 +615,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(a.to_half(), b.to_half()); } @@ -623,7 +632,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator<(const float16& a, const float16& b) { return __hlt(a.to_half(), b.to_half()); } @@ -640,7 +649,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(a.to_half(), b.to_half()); } @@ -657,7 +666,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(a.to_half(), b.to_half()); } @@ -674,7 +683,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) +#if defined(__HIPCC__) || defined(__MUSACC__) DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(a.to_half(), b.to_half()); } @@ -965,7 +974,7 @@ DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } #else HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(__MUSACC__)) return __hisnan(a.to_half()); #else return (a.x & 0x7fff) > 0x7c00; @@ -983,7 +992,7 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) { HOSTDEVICE inline float16(abs)(const float16& a) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) return float16(::fabs(static_cast(a))); #else return float16(std::abs(static_cast(a))); diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index 1af8cc442a117..a1fc14073d96a 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,7 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } @@ -90,8 +90,8 @@ void EmplaceDeviceContexts( stream_priority); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) 
const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return MemoryUtils::Instance().GetAllocator(device_id, stream); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index 9e4e573277549..abcc6ac003c64 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -34,6 +34,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + namespace phi { struct MemoryInterface { @@ -128,7 +133,7 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief get the memory usage of current GPU device. * @@ -161,8 +166,8 @@ struct MemoryInterface { bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream); phi::Allocator* (*get_host_allocator)(); phi::Allocator* (*get_zero_allocator)(int device_id); @@ -292,7 +297,7 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -344,8 +349,8 @@ class MemoryUtils { "Fluid. 
You can call InitMemoryMethod() for initialization.")); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return memory_method_->get_allocator(device_id, stream); } @@ -421,7 +426,7 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total); #endif @@ -434,8 +439,8 @@ void EmplaceDeviceContexts( bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream); const Allocator* GetHostAllocator(); diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 008f45aa93554..c205bb7675393 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -129,7 +129,7 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -175,7 +175,7 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index e80561284b885..0b1a94aa0c1b9 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,6 +153,12 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first), + CastToCUDATransformIterator(last), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -184,6 +190,13 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first1), + CastToCUDATransformIterator(last1), + CastToCUDATransformIterator(first2), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index d4c5de0dbe6dc..15585543417d8 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -61,7 +61,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); case phi::Backend::UNDEFINED: return phi::Place(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -70,7 +70,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -81,7 +81,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b27770b081433..50c07b6e2cc46 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -23,6 +23,11 @@ limitations under the License. 
*/ using gpuStream_t = cudaStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -73,6 +78,9 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( + &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -92,6 +100,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -112,6 +122,14 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + musaError_t err = musaStreamQuery(raw_stream()); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { @@ -134,6 +152,8 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -146,6 +166,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/distributed/CMakeLists.txt b/paddle/phi/core/distributed/CMakeLists.txt index 00000c3fff9e0..34046df6013a5 100644 --- a/paddle/phi/core/distributed/CMakeLists.txt +++ b/paddle/phi/core/distributed/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(auto_parallel) set(DISTRIBUTED_COMMON_SRCS comm_context_manager.cc) -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) list(APPEND DISTRIBUTED_COMMON_SRCS comm_task_manager.cc) list(APPEND DISTRIBUTED_COMMON_SRCS nccl_comm_context.cc nccl_comm_task.cc nccl_tools.cc) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc index e7a1ec15da307..9407d1fad7f42 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc @@ -101,7 +101,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, store, unique_comm_key, dev_ctx.GetPlace(), rank, world_size); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (phi::GPUContext::classof(&dev_ctx)) { CommContextManager::CreateNCCLCommContext( store, unique_comm_key, rank, world_size); @@ -164,7 +164,7 @@ bool NeedComputationClipForPP( } Place GetDefaultPlace() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (phi::backends::gpu::GetGPUDeviceCount() >= 0) { return paddle::DefaultGPUPlace(); } diff 
--git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h index 022dc06598064..41cfd4efca8fd 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h @@ -71,7 +71,7 @@ std::vector BalancedSplit(int64_t total_nums, int64_t num_of_pieces); CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, const std::vector& process_ids); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ @@ -123,7 +123,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \ } while (0) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ diff --git a/paddle/phi/core/distributed/check/CMakeLists.txt b/paddle/phi/core/distributed/check/CMakeLists.txt index 1721a4a4602d1..964106feac402 100644 --- a/paddle/phi/core/distributed/check/CMakeLists.txt +++ b/paddle/phi/core/distributed/check/CMakeLists.txt @@ -1,6 +1,6 @@ set(CHECK_COMMON_SRCS static_check.cc) -if(WITH_NCCL OR WITH_RCCL) +if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) list(APPEND CHECK_COMMON_SRCS nccl_dynamic_check.cc) endif() diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index 9307af45bd622..4a7b931ad2b33 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -30,6 +30,16 @@ #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuFree hipFree +#elif defined(PADDLE_WITH_MCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" + +#define gpuMalloc musaMalloc +#define gpuMemcpy musaMemcpy +#define gpuMemcpyDeviceToHost musaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice musaMemcpyHostToDevice +#define gpuFree musaFree #else #include @@ -56,7 +66,7 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - ncclComm_t comm) { + mcclComm_t comm) { constexpr int kSize = sizeof(int64_t); int64_t dtype_host = static_cast(tensor.dtype()); int64_t* dtype_device; @@ -64,10 +74,10 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(dtype_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(dtype_device, dtype_device, 1, - ncclInt64, + mcclInt64, root_rank, comm, kDefaultStream)); @@ -95,7 +105,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - ncclComm_t comm) { + mcclComm_t comm) { CheckDataType(tensor, root_rank, cur_rank, comm); constexpr int kSize = sizeof(int64_t); @@ -106,10 +116,10 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, 
PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(shape_device, shape_device, 1, - ncclInt64, + mcclInt64, root_rank, comm, kDefaultStream)); @@ -130,7 +140,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - ncclComm_t comm) { + mcclComm_t comm) { CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm); CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm); @@ -143,11 +153,11 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduce(in_shape_device, in_shape_device, 1, - ncclInt64, - ncclSum, + mcclInt64, + mcclSum, rank, comm, kDefaultStream)); @@ -167,7 +177,7 @@ void NCCLDynamicCheck::CheckGatherShape( int root_rank, int cur_rank, int world_size, - ncclComm_t comm) { + mcclComm_t comm) { std::vector shapes(world_size, 0); shapes[cur_rank] = in_tensor.numel(); int64_t* in_shape_device; @@ -178,11 +188,11 @@ void NCCLDynamicCheck::CheckGatherShape( world_size * sizeof(int64_t), gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce(in_shape_device, in_shape_device, world_size, - ncclInt64, - ncclSum, + mcclInt64, + mcclSum, comm, kDefaultStream)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(shapes.data(), diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.h b/paddle/phi/core/distributed/check/nccl_dynamic_check.h index 23e8386d6f2af..502ec886211e1 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.h +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.h @@ -21,6 +21,8 @@ #if defined(PADDLE_WITH_RCCL) using gpuStream_t = hipStream_t; +#elif defined(PADDLE_WITH_MCCL) +using gpuStream_t = musaStream_t; #else using gpuStream_t = cudaStream_t; #endif @@ -36,21 +38,21 @@ struct NCCLDynamicCheck { static void CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - ncclComm_t comm); + mcclComm_t comm); static void CheckShape(const phi::DenseTensor& tensor, int64_t shape); static void CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - ncclComm_t comm); + mcclComm_t comm); static void CheckShape(const phi::DenseTensor& out_tensor, const phi::DenseTensor& in_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - ncclComm_t comm); + mcclComm_t comm); // can be used to check gather and all gather static void CheckGatherShape(const phi::DenseTensor& in_tensor, @@ -58,7 +60,7 @@ struct NCCLDynamicCheck { int root_rank, int cur_rank, int world_size, - ncclComm_t comm); + mcclComm_t comm); private: // `0` represents default stream for both cuda & hip diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 5fd7861cc52b2..2aee7c7c85104 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -29,7 +29,7 @@ #include "paddle/phi/core/distributed/store/gloo_store.h" #endif -#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -49,13 +49,13 @@ namespace distributed { int CommContextManager::device_id = -1; void CommContextManager::SetDeviceId(int dev_id) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) phi::backends::gpu::SetDeviceId(dev_id); CommContextManager::device_id = dev_id; #endif } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) void CommContextManager::CreateNCCLCommContext( const std::shared_ptr& store, const std::string& unique_comm_key, @@ -67,16 +67,16 @@ void CommContextManager::CreateNCCLCommContext( if (comm_context_manager.Has(unique_comm_key)) { return; } - ncclUniqueId nccl_id; + mcclUniqueId nccl_id; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetUniqueId(&nccl_id)); } std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { std::vector nccl_id_wrapper( reinterpret_cast(&nccl_id), - reinterpret_cast(&nccl_id) + NCCL_UNIQUE_ID_BYTES); + reinterpret_cast(&nccl_id) + MCCL_UNIQUE_ID_BYTES); store->set(unique_key, nccl_id_wrapper); } else { const auto& nccl_id_wrapper = store->get(unique_key); @@ -231,8 +231,8 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { return id_to_comm_context_.at(unique_comm_key).get(); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -int CommContextManager::GetRingId(const ncclComm_t& comm) const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +int CommContextManager::GetRingId(const mcclComm_t& comm) const { for (auto iter = id_to_comm_context_.begin(); iter != id_to_comm_context_.end(); ++iter) { diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index 8c4d802294986..5c3f3101dcada 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -24,7 +24,7 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/distributed/comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/backends/gpu/forwards.h" #endif @@ -57,8 +57,8 @@ class CommContextManager { CommContext* Get(const std::string& unique_comm_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int GetRingId(const ncclComm_t& comm) const; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + int GetRingId(const mcclComm_t& comm) const; #endif bool Has(const std::string& unique_comm_key) const; @@ -71,7 +71,7 @@ class CommContextManager { std::vector GetGroupRanks(const std::string& pg_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) static void CreateNCCLCommContext(const std::shared_ptr& 
store, const std::string& unique_comm_key, int rank, diff --git a/paddle/phi/core/distributed/comm_task.h b/paddle/phi/core/distributed/comm_task.h index 47ba01b980479..ca7f8495495d2 100644 --- a/paddle/phi/core/distributed/comm_task.h +++ b/paddle/phi/core/distributed/comm_task.h @@ -25,6 +25,9 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" #endif +#if defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif #if defined(PADDLE_WITH_NCCL) #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -43,7 +46,7 @@ class CommTask { int gid = 0, uint64_t seq = 0, int64_t numel = 0, - ncclComm_t nccl_comm = nullptr, + mcclComm_t nccl_comm = nullptr, gpuStream_t nccl_stream = nullptr, CommType comm_type = CommType::UNKNOWN) : backend_(backend), @@ -89,7 +92,7 @@ class CommTask { std::shared_ptr GetStore() { return store_; } void SetStore(std::shared_ptr store) { store_ = store; } - ncclComm_t nccl_comm() { return nccl_comm_; } + mcclComm_t nccl_comm() { return nccl_comm_; } gpuStream_t nccl_stream() { return nccl_stream_; } virtual std::string GetTraceMsg() { @@ -160,7 +163,7 @@ class CommTask { int gid_; uint64_t seq_{0}; int64_t numel_; - ncclComm_t nccl_comm_; + mcclComm_t nccl_comm_; gpuStream_t nccl_stream_; CommType comm_type_; bool start_trace_updated_{false}; diff --git a/paddle/phi/core/distributed/comm_task_manager.cc b/paddle/phi/core/distributed/comm_task_manager.cc index ae7de42291358..822b3892ec364 100644 --- a/paddle/phi/core/distributed/comm_task_manager.cc +++ b/paddle/phi/core/distributed/comm_task_manager.cc @@ -32,7 +32,7 @@ #include "paddle/phi/core/distributed/store/store.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) #include "paddle/phi/core/distributed/comm_task_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #endif diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 8da676e74d911..4600d2e14cdbb 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -30,16 +30,16 @@ namespace distributed { // set this flag to `true` and recompile to enable dynamic checks constexpr bool FLAGS_enable_nccl_dynamic_check = false; -NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) +NCCLCommContext::NCCLCommContext(int rank, int size, mcclUniqueId nccl_id) : CommContext(rank, size) { - NCCL_CHECK( - phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); - NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_)); + MCCL_CHECK( + phi::dynload::mcclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + MCCL_CHECK(phi::dynload::mcclGetVersion(&nccl_version_)); } int NCCLCommContext::GetNcclVersion() { return nccl_version_; } -ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } +mcclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } @@ -77,7 +77,7 @@ void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, if (FLAGS_enable_nccl_dynamic_check) { NCCLDynamicCheck::CheckShape(*out_tensor, root, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclBroadcast(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclBroadcast(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -100,7 +100,7 @@ 
void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclAllGather(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclAllGather(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -109,7 +109,7 @@ void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, } void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::ScatterLikeShape(*out_tensor, in_tensor, @@ -122,7 +122,7 @@ void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclReduceScatter(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclReduceScatter(in_tensor.data(), out_tensor->data(), out_tensor->numel(), ToNCCLDataType(in_tensor.type()), @@ -141,7 +141,7 @@ void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, NCCLDynamicCheck::CheckShape(in_tensor, rank_, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclSend(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclSend(in_tensor.data(), count, ToNCCLDataType(in_tensor.dtype()), peer, @@ -160,7 +160,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, NCCLDynamicCheck::CheckShape(*out_tensor, peer, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclRecv(out_tensor->data(), + MCCL_CHECK(phi::dynload::mcclRecv(out_tensor->data(), count, ToNCCLDataType(out_tensor->dtype()), peer, @@ -172,7 +172,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, in_tensor, @@ -185,7 +185,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclAllReduce(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclAllReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -196,7 +196,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, int root, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, @@ -210,7 +210,7 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclReduce(in_tensor.data(), + MCCL_CHECK(phi::dynload::mcclReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -221,23 +221,23 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, } void NCCLCommContext::GroupStart() { - NCCL_CHECK(phi::dynload::ncclGroupStart()); + MCCL_CHECK(phi::dynload::mcclGroupStart()); } -void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } +void NCCLCommContext::GroupEnd() { MCCL_CHECK(phi::dynload::mcclGroupEnd()); } -#if NCCL_VERSION_CODE >= 21100 -void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, +// #if NCCL_VERSION_CODE >= 21100 +void NCCLCommContext::RedOpCreatePreMulSum(mcclRedOp_t* op, void* scalar, - ncclDataType_t dtype, - ncclScalarResidence_t residence) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + mcclDataType_t dtype, + mcclScalarResidence_t residence) { + 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( op, scalar, dtype, residence, nccl_comm_)); } -void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); +void NCCLCommContext::RedOpDestroy(mcclRedOp_t op) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, nccl_comm_)); } -#endif +// #endif } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index 609b5e0defe07..e7a73f1204672 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -18,6 +18,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -29,6 +34,8 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -39,12 +46,12 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: - NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); + NCCLCommContext(int rank, int size, mcclUniqueId nccl_id); ~NCCLCommContext() override = default; int GetNcclVersion(); - ncclComm_t GetNcclComm(); + mcclComm_t GetNcclComm(); gpuStream_t GetStream(); @@ -80,7 +87,7 @@ class NCCLCommContext final : public CommContext { void ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, gpuStream_t stream); void AllGather(phi::DenseTensor* out_tensor, @@ -89,12 +96,12 @@ class NCCLCommContext final : public CommContext { void AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, gpuStream_t stream); void Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - ncclRedOp_t reduce_type, + mcclRedOp_t reduce_type, int root, gpuStream_t stream); @@ -102,25 +109,25 @@ class NCCLCommContext final : public CommContext { void GroupEnd(); -#if NCCL_VERSION_CODE >= 21100 +// #if NCCL_VERSION_CODE >= 21100 // Creates a new reduction operator which pre-multiplies input values by a // given scalar locally before reducing them with peer values via summation. - void RedOpCreatePreMulSum(ncclRedOp_t* op, + void RedOpCreatePreMulSum(mcclRedOp_t* op, void* scalar, - ncclDataType_t dtype, - ncclScalarResidence_t residence); + mcclDataType_t dtype, + mcclScalarResidence_t residence); // Destroys the reduction operator op. The operator must have been created by // ncclRedOpCreatePreMul with the matching communicator comm. 
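Taken together, these renames keep the public class and method names (NCCLCommContext, AllReduce, ReduceScatter, and so on) and only swap each NCCL symbol for its MCCL counterpart, so existing call sites keep compiling. A rough usage sketch under that assumption, with types and dynload wrappers exactly as declared in these hunks; AllReduceSumSketch is a hypothetical helper, and tensor/stream setup is omitted:

// Sketch: an all-reduce over the renamed API. The reduction op is now an
// mcclRedOp_t, so mcclSum replaces ncclSum at the call site.
void AllReduceSumSketch(phi::distributed::NCCLCommContext* comm_ctx,
                        phi::DenseTensor* out,
                        const phi::DenseTensor& in,
                        gpuStream_t stream) {
  comm_ctx->AllReduce(out, in, mcclSum, stream);
}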
- void RedOpDestroy(ncclRedOp_t op); -#endif + void RedOpDestroy(mcclRedOp_t op); +// #endif private: DISABLE_COPY_AND_ASSIGN(NCCLCommContext); int nccl_version_; - ncclComm_t nccl_comm_; + mcclComm_t nccl_comm_; std::unique_ptr dev_ctx_; diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc index 4e2efea0068eb..5f11c8101df93 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.cc +++ b/paddle/phi/core/distributed/nccl_comm_task.cc @@ -33,7 +33,7 @@ NCCLCommTask::NCCLCommTask(const phi::Place& place, int64_t numel, bool sync_op, bool use_calc_stream, - ncclComm_t nccl_comm, + mcclComm_t nccl_comm, gpuStream_t stream, CommType comm_type, int64_t timeout) @@ -62,6 +62,8 @@ void NCCLCommTask::StartRecord() { if (!start_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_start_event_, cuda_event_flags_)); +#elif defined(PADDLE_WITH_MUSA) + MUSA_CHECK(musaEventCreateWithFlags(&nccl_start_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_start_event_, hip_event_flags_)); #endif @@ -69,6 +71,8 @@ void NCCLCommTask::StartRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_start_event_, nccl_stream_)); +#elif defined(PADDLE_WITH_MUSA) + MUSA_CHECK(musaEventRecord(nccl_start_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_start_event_, nccl_stream_)); #endif @@ -78,6 +82,8 @@ void NCCLCommTask::EndRecord() { if (!end_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_end_event_, cuda_event_flags_)); +#elif defined(PADDLE_WITH_MUSA) + MUSA_CHECK(musaEventCreateWithFlags(&nccl_end_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_end_event_, hip_event_flags_)); #endif @@ -85,6 +91,8 @@ void NCCLCommTask::EndRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_end_event_, nccl_stream_)); +#elif defined(PADDLE_WITH_MUSA) + MUSA_CHECK(musaEventRecord(nccl_end_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_end_event_, nccl_stream_)); #endif @@ -103,6 +111,19 @@ void NCCLCommTask::ClearRecord() { end_event_created_ = false; } } +#elif defined(PADDLE_WITH_MUSA) +void NCCLCommTask::ClearRecord() { + if (start_event_created_) { + backends::gpu::GPUDeviceGuard guard(place_.device); + MUSA_CHECK(musaEventDestroy(nccl_start_event_)); + start_event_created_ = false; + } + if (end_event_created_) { + backends::gpu::GPUDeviceGuard guard(place_.device); + MUSA_CHECK(musaEventDestroy(nccl_end_event_)); + end_event_created_ = false; + } +} #else // PADDLE_WITH_HIP void NCCLCommTask::ClearRecord() { if (start_event_created_) { @@ -129,6 +150,16 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { // ignore and clear the error if not ready CUDA_CHECK(cudaGetLastError()); } +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = musaEventQuery(event); + if (ret == musaSuccess) { + return true; + } else if (ret != musaErrorNotReady) { + MUSA_CHECK(ret); + } else { + // ignore and clear the error if not ready + MUSA_CHECK(musaGetLastError()); + } #else // PADDLE_WITH_HIP hipError_t ret = hipEventQuery(event); if (ret == hipSuccess) { @@ -143,7 +174,7 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { return false; } -std::string GetNCCLErrorDetail(ncclResult_t result) { +std::string GetNCCLErrorDetail(mcclResult_t result) { std::string detail; std::string last_error; #ifdef ENABLE_NCCL_GET_LAST_ERROR @@ -151,10 
+182,10 @@ std::string GetNCCLErrorDetail(ncclResult_t result) { ", Last error: " + std::string(phi::dynload::ncclGetLastError(NULL)); #endif switch (result) { - case ncclUnhandledCudaError: + case mcclUnhandledCudaError: detail = "ncclUnhandledCudaError: Call to CUDA function failed."; break; - case ncclSystemError: + case mcclSystemError: detail = "ncclSystemError: System call (e.g. socket, malloc) or external " "library call failed or device error. "; @@ -164,13 +195,13 @@ std::string GetNCCLErrorDetail(ncclResult_t result) { detail += "It can be also caused by unexpected exit of a remote peer."; #endif break; - case ncclInternalError: + case mcclInternalError: detail = "ncclInternalError: Internal check failed."; break; - case ncclInvalidArgument: + case mcclInvalidArgument: detail = "ncclInvalidArgument: Invalid value for an argument."; break; - case ncclInvalidUsage: + case mcclInvalidUsage: detail = "ncclInvalidUsage: This usually reflects invalid usage of NCCL " "library."; @@ -194,10 +225,10 @@ std::string NCCLCommTask::GetCommErrors() { return comm_error_; } - ncclResult_t nccl_async_error; - NCCL_CHECK( - phi::dynload::ncclCommGetAsyncError(nccl_comm_, &nccl_async_error)); - if (nccl_async_error != ncclSuccess) { + mcclResult_t nccl_async_error; + MCCL_CHECK( + phi::dynload::mcclCommGetAsyncError(nccl_comm_, &nccl_async_error)); + if (nccl_async_error != mcclSuccess) { comm_error_ = "\n\t Find nccl comm error: " + GetNCCLErrorDetail(nccl_async_error); } @@ -241,7 +272,7 @@ void NCCLCommTask::AbortComm() { if (aborted_) { return; } - NCCL_CHECK(phi::dynload::ncclCommAbort(nccl_comm_)); + MCCL_CHECK(phi::dynload::mcclCommAbort(nccl_comm_)); aborted_ = true; nccl_comm_ = nullptr; diff --git a/paddle/phi/core/distributed/nccl_comm_task.h b/paddle/phi/core/distributed/nccl_comm_task.h index fca9004cf0b2d..11bbbd1c9dcf7 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.h +++ b/paddle/phi/core/distributed/nccl_comm_task.h @@ -21,6 +21,8 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -42,7 +44,7 @@ class NCCLCommTask : public CommTask { int64_t numel = 0, bool sync_op = true, bool use_calc_stream = false, - ncclComm_t = nullptr, + mcclComm_t = nullptr, gpuStream_t = nullptr, CommType comm_type = CommType::UNKNOWN, int64_t timeout = DefaultTimeout); @@ -71,6 +73,8 @@ class NCCLCommTask : public CommTask { #ifdef PADDLE_WITH_CUDA unsigned int cuda_event_flags_ = cudaEventDisableTiming; +#elif defined(PADDLE_WITH_MUSA) + unsigned int musa_event_flags_ = musaEventDisableTiming; #else // PADDLE_WITH_HIP unsigned int hip_event_flags_ = hipEventDisableTiming; #endif diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index a5388796d1f45..24a1f3ee7891d 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -19,74 +19,74 @@ #include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -#if NCCL_VERSION_CODE >= 21300 +// #if NCCL_VERSION_CODE >= 21300 #define ENABLE_NCCL_GET_LAST_ERROR #define NCCL_REMOTE_ERROR -#endif +// #endif namespace phi { namespace distributed { -ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { - static const std::unordered_map red_type = { - {ReduceOp::MIN, ncclMin}, - {ReduceOp::MAX, ncclMax}, - {ReduceOp::SUM, ncclSum}, - {ReduceOp::PRODUCT, ncclProd}, +mcclRedOp_t ToNCCLRedType(ReduceOp 
reduction) { + static const std::unordered_map red_type = { + {ReduceOp::MIN, mcclMin}, + {ReduceOp::MAX, mcclMax}, + {ReduceOp::SUM, mcclSum}, + {ReduceOp::PRODUCT, mcclProd}, }; auto it = red_type.find(reduction); PADDLE_ENFORCE_EQ(it != red_type.end(), true, phi::errors::InvalidArgument( - "Invalid nccl reduction. Must be ncclMin | ncclMax | " - "ncclProd | ncclSum")); + "Invalid nccl reduction. Must be mcclMin | mcclMax | " + "mcclProd | mcclSum")); return it->second; } -std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { +std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID) { const uint8_t* bytes = reinterpret_cast(&ncclID); std::ostringstream oss; - for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { + for (auto i = 0; i < MCCL_UNIQUE_ID_BYTES; ++i) { oss << std::hex << static_cast(bytes[i]); } return oss.str(); } -std::string NCCLDTypeToString(ncclDataType_t dtype) { +std::string NCCLDTypeToString(mcclDataType_t dtype) { #define PD_NCCL_DTYPE_TO_STR(__nccl_dtype, __str_dtype) \ if (dtype == __nccl_dtype) return __str_dtype; - PD_NCCL_DTYPE_TO_STR(ncclFloat, "float32"); - PD_NCCL_DTYPE_TO_STR(ncclFloat32, "float32"); - PD_NCCL_DTYPE_TO_STR(ncclHalf, "float16"); - PD_NCCL_DTYPE_TO_STR(ncclFloat16, "float16"); -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - PD_NCCL_DTYPE_TO_STR(ncclBfloat16, "bfloat16"); -#endif - PD_NCCL_DTYPE_TO_STR(ncclDouble, "float64"); - PD_NCCL_DTYPE_TO_STR(ncclFloat64, "float64"); + PD_NCCL_DTYPE_TO_STR(mcclFloat, "float32"); + PD_NCCL_DTYPE_TO_STR(mcclFloat32, "float32"); + PD_NCCL_DTYPE_TO_STR(mcclHalf, "float16"); + PD_NCCL_DTYPE_TO_STR(mcclFloat16, "float16"); +// // #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +// PD_NCCL_DTYPE_TO_STR(mcclBfloat16, "bfloat16"); +// // #endif + PD_NCCL_DTYPE_TO_STR(mcclDouble, "float64"); + PD_NCCL_DTYPE_TO_STR(mcclFloat64, "float64"); - PD_NCCL_DTYPE_TO_STR(ncclInt8, "int8"); - PD_NCCL_DTYPE_TO_STR(ncclChar, "int8"); - PD_NCCL_DTYPE_TO_STR(ncclUint8, "uint8"); - PD_NCCL_DTYPE_TO_STR(ncclInt32, "int32"); - PD_NCCL_DTYPE_TO_STR(ncclInt, "int32"); - PD_NCCL_DTYPE_TO_STR(ncclUint32, "uint32"); - PD_NCCL_DTYPE_TO_STR(ncclInt64, "int64"); - PD_NCCL_DTYPE_TO_STR(ncclUint64, "uint64"); + PD_NCCL_DTYPE_TO_STR(mcclInt8, "int8"); + PD_NCCL_DTYPE_TO_STR(mcclChar, "int8"); + PD_NCCL_DTYPE_TO_STR(mcclUint8, "uint8"); + PD_NCCL_DTYPE_TO_STR(mcclInt32, "int32"); + PD_NCCL_DTYPE_TO_STR(mcclInt, "int32"); + PD_NCCL_DTYPE_TO_STR(mcclUint32, "uint32"); + PD_NCCL_DTYPE_TO_STR(mcclInt64, "int64"); + PD_NCCL_DTYPE_TO_STR(mcclUint64, "uint64"); #undef PD_NCCL_DTYPE_TO_STR PADDLE_THROW(phi::errors::InvalidArgument( "This datatype %d in nccl is not supported.", static_cast(dtype))); } -std::string NCCLRedTypeToString(ncclRedOp_t op) { - if (op == ncclSum) return "SUM"; - if (op == ncclProd) return "PROD"; - if (op == ncclMin) return "MIN"; - if (op == ncclMax) return "MAX"; -#if NCCL_VERSION_CODE >= 21000 - if (op == ncclAvg) return "AVG"; -#endif +std::string NCCLRedTypeToString(mcclRedOp_t op) { + if (op == mcclSum) return "SUM"; + if (op == mcclProd) return "PROD"; + if (op == mcclMin) return "MIN"; + if (op == mcclMax) return "MAX"; +// #if NCCL_VERSION_CODE >= 21000 + if (op == mcclAvg) return "AVG"; +// #endif return "UDF_" + std::to_string(op); } diff --git a/paddle/phi/core/distributed/nccl_tools.h b/paddle/phi/core/distributed/nccl_tools.h index 0ab380a417783..e256d4ef4d009 100644 --- a/paddle/phi/core/distributed/nccl_tools.h +++ b/paddle/phi/core/distributed/nccl_tools.h @@ -21,6 +21,9 @@ #ifdef 
PADDLE_WITH_RCCL #include #include "paddle/phi/backends/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include +#include "paddle/phi/backends/dynload/mccl.h" #else #include #include "paddle/phi/backends/dynload/nccl.h" @@ -32,7 +35,7 @@ namespace distributed { #define NCCL_CHECK(cmd) \ do { \ ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ + if (r != mcclSuccess) { \ PADDLE_THROW( \ phi::errors::External("Failed, NCCL error %s:%d '%s'\n", \ __FILE__, \ @@ -41,6 +44,18 @@ namespace distributed { } \ } while (0) +#define MCCL_CHECK(cmd) \ + do { \ + mcclResult_t r = cmd; \ + if (r != mcclSuccess) { \ + PADDLE_THROW( \ + phi::errors::External("Failed, MCCL error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + phi::dynload::mcclGetErrorString(r))); \ + } \ + } while (0) + #ifdef PADDLE_WITH_NCCL #define CUDA_CHECK(expr) \ do { \ @@ -52,6 +67,17 @@ namespace distributed { cudaGetErrorString(r))); \ } \ } while (0) +#elif defined(PADDLE_WITH_MCCL) +#define MUSA_CHECK(expr) \ + do { \ + musaError_t r = expr; \ + if (r != musaSuccess) { \ + PADDLE_THROW(phi::errors::External("Failed, musa error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + musaGetErrorString(r))); \ + } \ + } while (0) #else // PADDLE_WITH_RCCL #define HIP_CHECK(expr) \ do { \ @@ -65,13 +91,13 @@ namespace distributed { } while (0) #endif -ncclRedOp_t ToNCCLRedType(ReduceOp reduction); +mcclRedOp_t ToNCCLRedType(ReduceOp reduction); -std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); +std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID); -std::string NCCLDTypeToString(ncclDataType_t dtype); +std::string NCCLDTypeToString(mcclDataType_t dtype); -std::string NCCLRedTypeToString(ncclRedOp_t op); +std::string NCCLRedTypeToString(mcclRedOp_t op); } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 61e502951f24e..0c21ffac88703 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -23,6 +23,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #include @@ -55,6 +65,17 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mufft.h" +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -69,7 +90,7 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_types.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -326,6 +347,17 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) +#elif defined(__MUSACC__) +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Error: %s:%d Assertion `%s` failed. 
" __FORMAT "\n", \ + __FILE__, \ + __LINE__, \ + #_IS_NOT_ERROR, \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -570,7 +602,7 @@ DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); #endif } // namespace details @@ -666,7 +698,7 @@ inline std::string build_nvidia_error_msg(CUresult stat) { /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; + return nccl_result != mcclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { @@ -867,7 +899,7 @@ inline std::string build_rocm_error_msg(rocblas_status stat) { /****** RCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; + return nccl_result != mcclSuccess; } inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { @@ -903,7 +935,7 @@ DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); #endif } // namespace details @@ -958,7 +990,7 @@ inline void retry_sleep(unsigned millisecond) { } \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_rocm_error_msg(__cond__)); \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ } while (0) @@ -966,6 +998,234 @@ inline void retry_sleep(unsigned millisecond) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP + + + + + + + + + + + + + + + +/**************************************************************************/ +/***************************** MUSA ERROR **********************************/ +#ifdef PADDLE_WITH_MUSA + +/***** MUSA ERROR *****/ +inline bool is_error(musaError_t e) { return e != musaSuccess; } + +inline std::string build_musa_error_msg(musaError_t e) { + std::ostringstream sout; + sout << " Musa error(" << e << "), " << musaGetErrorString(e) << "."; + return sout.str(); +} + +/***** MURAND ERROR *****/ +inline bool is_error(murandStatus_t stat) { + return stat != MURAND_STATUS_SUCCESS; +} + +inline const char* murandGetErrorString(murandStatus_t stat) { + switch (stat) { + case MURAND_STATUS_SUCCESS: + return "MURAND_STATUS_SUCCESS"; + case MURAND_STATUS_VERSION_MISMATCH: + return "MURAND_STATUS_VERSION_MISMATCH"; + case MURAND_STATUS_NOT_CREATED: + return "MURAND_STATUS_NOT_CREATED"; + case MURAND_STATUS_ALLOCATION_FAILED: + return "MURAND_STATUS_ALLOCATION_FAILED"; + case MURAND_STATUS_TYPE_ERROR: + return "MURAND_STATUS_TYPE_ERROR"; + case MURAND_STATUS_OUT_OF_RANGE: + return "MURAND_STATUS_OUT_OF_RANGE"; + case MURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "MURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case MURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "MURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case MURAND_STATUS_LAUNCH_FAILURE: + return "MURAND_STATUS_LAUNCH_FAILURE"; + case MURAND_STATUS_INTERNAL_ERROR: + return "MURAND_STATUS_INTERNAL_ERROR"; + case MURAND_STATUS_NOT_IMPLEMENTED: + return "MURAND_STATUS_NOT_IMPLEMENTED"; 
+ default: + return "Unknown murand status"; + } +} + +inline std::string build_musa_error_msg(murandStatus_t stat) { + std::string msg(" Murand error, "); + return msg + murandGetErrorString(stat) + " "; +} + +/***** mudnn ERROR *****/ +// inline bool is_error(mudnnStatus_t stat) { +// return stat != cudnnStatusSuccess; +// } + +// inline std::string build_rocm_error_msg(miopenStatus_t stat) { +// std::string msg(" Miopen error, "); +// return msg + phi::dynload::miopenGetErrorString(stat) + " "; +// } + +/***** MUBLAS ERROR *****/ +inline bool is_error(mublasStatus stat) { + return stat != MUBLAS_STATUS_SUCCESS; +} + +inline const char* mublasGetErrorString(mublasStatus stat) { + switch (stat) { + case MUBLAS_STATUS_SUCCESS: + return "MUBLAS_STATUS_SUCCESS"; + case MUBLAS_STATUS_INVALID_HANDLE: + return "MUBLAS_STATUS_INVALID_HANDLE"; + case MUBLAS_STATUS_NOT_IMPLEMENTED: + return "MUBLAS_STATUS_NOT_IMPLEMENTED"; + case MUBLAS_STATUS_INVALID_POINTER: + return "MUBLAS_STATUS_INVALID_POINTER"; + case MUBLAS_STATUS_INVALID_SIZE: + return "MUBLAS_STATUS_INVALID_SIZE"; + case MUBLAS_STATUS_MEMORY_ERROR: + return "MUBLAS_STATUS_MEMORY_ERROR"; + case MUBLAS_STATUS_INTERNAL_ERROR: + return "MUBLAS_STATUS_INTERNAL_ERROR"; + case MUBLAS_STATUS_PERF_DEGRADED: + return "MUBLAS_STATUS_PERF_DEGRADED"; + case MUBLAS_STATUS_SIZE_QUERY_MISMATCH: + return "MUBLAS_STATUS_SIZE_QUERY_MISMATCH"; + case MUBLAS_STATUS_SIZE_INCREASED: + return "MUBLAS_STATUS_SIZE_INCREASED"; + case MUBLAS_STATUS_SIZE_UNCHANGED: + return "MUBLAS_STATUS_SIZE_UNCHANGED"; + case MUBLAS_STATUS_INVALID_VALUE: + return "MUBLAS_STATUS_INVALID_VALUE"; + case MUBLAS_STATUS_CONTINUE: + return "MUBLAS_STATUS_CONTINUE"; + case MUBLAS_STATUS_CHECK_NUMERICS_FAIL: + return "MUBLAS_STATUS_CHECK_NUMERICS_FAIL"; + default: + return "Unknown mublas status"; + } +} + +inline std::string build_musa_error_msg(mublasStatus stat) { + std::string msg(" mublas error, "); + return msg + mublasGetErrorString(stat) + " "; +} + +/****** MCCL ERROR ******/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +inline bool is_error(mcclResult_t mccl_result) { + return mccl_result != mcclSuccess; +} + +inline std::string build_musa_error_msg(mcclResult_t mccl_result) { + std::string msg(" Mccl error, "); + return msg + phi::dynload::mcclGetErrorString(mccl_result) + " "; +} +#endif // not(__APPLE__) and PADDLE_WITH_MCCL + +/***** MUFFT ERROR *****/ +inline bool is_error(mufftResult_t stat) { return stat != MUFFT_SUCCESS; } + +inline std::string build_musa_error_msg(mufftResult_t stat) { + std::string msg(" MUFFT error, "); + return msg + phi::dynload::mufftGetErrorString(stat) + " "; +} + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); +DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mublasStatus, MUBLAS_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mufftResult_t, MUFFT_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); +#endif + +} // namespace details + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + 
if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_WARN_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + ::phi::enforce::ThrowWarnInternal( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned millisecond) { +#ifdef _WIN32 + Sleep(millisecond); +#else + sleep(millisecond); +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + ::phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_MUSA + + + + + + } // namespace enforce using namespace enforce; // NOLINT } // namespace phi diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index a6764dfcf1c31..9304b42be1644 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,7 @@ // limitations under the License. #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +120,7 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDA related related FLAG @@ -215,7 +215,7 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +322,7 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * NCCL related FLAG @@ -541,7 +541,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** @@ -785,7 +785,7 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. 
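For reference, once the MUSA block above registers musaError_t (and the murand/mublas/mufft/mccl status types) through DEFINE_EXTERNAL_API_TYPE, the usual enforce macros work unchanged for MUSA runtime calls. A minimal sketch, assuming the MUSA runtime header and paddle/phi/core/enforce.h are included as in those hunks; RoundTripDeviceBuffer is a hypothetical helper:

// Sketch: every musaError_t return value is routed through the MUSA-aware
// PADDLE_ENFORCE_GPU_SUCCESS, which throws phi::errors::External built from
// build_musa_error_msg() on failure.
void RoundTripDeviceBuffer(size_t bytes) {
  void* dev_ptr = nullptr;
  PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&dev_ptr, bytes));
  PADDLE_ENFORCE_GPU_SUCCESS(musaMemset(dev_ptr, 0, bytes));
  PADDLE_ENFORCE_GPU_SUCCESS(musaFree(dev_ptr));
}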
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +800,7 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -819,7 +819,7 @@ PHI_DEFINE_EXPORTED_bool(use_fast_math, * Note: Get host by name time. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ - defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) PHI_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, "The maximum time for get host by name time"); @@ -1190,11 +1190,11 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table, * Note: nccl blocking wait. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(benchmark_nccl, false, "enable nccl debug mode to synchronize nccl comm"); @@ -1428,7 +1428,7 @@ PHI_DEFINE_EXPORTED_int32( PHI_DEFINE_EXPORTED_bool(print_ir, false, "Whether print ir debug str."); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ defined(PADDLE_WITH_XPU_BKCL) /** * Communication library related FLAG diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 82d37be80d3c3..a2fe426b0ec47 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,8 +278,7 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; VLOG(10) << "cur_offset = " << cur_offset diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index decebbe66a538..3295a2f6b3739 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/kernel_factory.cc 
b/paddle/phi/core/kernel_factory.cc index a5c5a3994a81b..6e534511802bb 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -124,7 +124,7 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -239,7 +239,7 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.cc b/paddle/phi/core/kernel_registry.cc index fa9d531b6534d..77ae9b45c9d68 100644 --- a/paddle/phi/core/kernel_registry.cc +++ b/paddle/phi/core/kernel_registry.cc @@ -34,7 +34,7 @@ void SetKernelArgsDef(const std::vector& args_type, #if defined(PADDLE_WITH_DNNL) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index b24e39b6c75bf..19f76f60f9a1b 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -1199,7 +1199,7 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST_EXCEPT_CUSTOM) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 715b4f76392d8..3b55ccd3dbc36 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -300,7 +300,7 @@ struct KernelImpl { /* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index 857bd546befcd..aba6a0f7bfca2 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,7 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +55,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, 
phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 35c59c2d8d787..700db5e8d4382 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -116,9 +116,11 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 17fdef1b9cfbd..03d8b3a0f661e 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -65,7 +65,7 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_DNNL dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -106,7 +106,7 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -394,7 +394,7 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -476,7 +476,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -530,7 +530,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -622,7 +622,7 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { 
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -722,7 +722,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -764,7 +764,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 449d7cbe8966d..ea1caf4ac067d 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -211,34 +211,35 @@ inline int TransToProtoVarType(const DataType& dtype) { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -inline ncclDataType_t ToNCCLDataType(DataType type) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +inline mcclDataType_t ToNCCLDataType(DataType type) { if (type == DataType::FLOAT32) { - return ncclFloat; + return mcclFloat; } else if (type == DataType::FLOAT64) { - return ncclDouble; + return mcclDouble; } else if (type == DataType::INT32) { - return ncclInt; + return mcclInt; } else if (type == DataType::INT64) { - return ncclInt64; + return mcclInt64; } else if (type == DataType::FLOAT16) { - return ncclFloat16; + return mcclFloat16; } else if (type == DataType::UINT8) { - return ncclUint8; + return mcclUint8; } else if (type == DataType::INT8) { - return ncclInt8; + return mcclInt8; } else if (type == DataType::BOOL) { - return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - } else if (type == DataType::BFLOAT16) { - return ncclBfloat16; -#endif + return mcclUint8; + // } else if (type == DataType::BFLOAT16) { + // return ncclBfloat16; } else { PADDLE_THROW( errors::Unimplemented("This datatype in nccl is not supported.")); } } #endif + + + #if defined(PADDLE_WITH_XPU_BKCL) inline BKCLDataType ToBKCLDataType(DataType type) { if (type == DataType::FLOAT32) { diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index b419338401eea..63c9cf63f9a32 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -54,12 +54,12 @@ template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class TypeInfoTraits; #endif diff --git 
a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd6..34a8fca61fbbe 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +35,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 7ee12e26d7d0e..6c61c3964b52d 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -150,7 +150,7 @@ namespace phi { ///////// BOOL and Floating and Integral Dispatch Marco /////////// -#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) +#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_XPU) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index eee92aa138044..ac3eb1f3cc12f 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -932,7 +932,7 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = static_cast(phi::SizeOf(dtype)); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int64_t numel = 0; for (auto item : input) { const auto& dim = item->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index f38a842a66987..2df3f34b57936 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -22,6 +22,9 @@ add_subdirectory(autotune) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") +if(WITH_MUSA) + list(REMOVE_ITEM kernel_cu "sparse/*.h") +endif() file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") @@ -40,6 +43,43 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") +if(WITH_MUSA) + # Create the list of file patterns to exclude + file( + GLOB files_to_remove + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "sparse/gpu/*.cu" + "gpudnn/*.cu") + + list(REMOVE_ITEM kernel_cu ${files_to_remove}) + message(STATUS "files_to_remove:${files_to_remove}") + + list( + REMOVE_ITEM + kernel_cu + "strings/gpu/strings_lower_upper_kernel.cu" + "strings/gpu/strings_copy_kernel.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "gpu/cudnn_lstm_kernel.cu" + "gpu/cudnn_lstm_grad_kernel.cu" + "gpu/instance_norm_kernel.cu" + "gpu/instance_norm_grad_kernel.cu" + "gpu/log_softmax_kernel.cu" + "gpu/log_softmax_grad_kernel.cu" + 
"gpu/weighted_sample_neighbors_kernel.cu" + "gpu/cross_entropy_kernel.cu" + "gpu/cross_entropy_grad_kernel.cu" + "gpu/gelu_kernel.cu" + "gpu/gelu_grad_kernel.cu" + "gpu/rnn_kernel.cu.cc" + "gpu/rnn_grad_kernel.cu.cc" + "gpu/clip_by_norm_kernel.cu" + "selected_rows/gpu/clip_by_norm_kernel.cu" + "gpu/softmax_grad_kernel.cu" + "gpu/softmax_kernel.cu" + ) +endif() + if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() @@ -177,32 +217,6 @@ if(NOT WITH_CUDNN_FRONTEND) "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") endif() -# Note(qili93): remove kernels not supported on DCU yet -if(WITH_ROCM) - list( - REMOVE_ITEM - kernel_cu - "gpu/affine_grid_grad_kernel.cu" - "gpu/apply_per_channel_scale_kernel.cu" - "gpu/cholesky_solve_kernel.cu" - "gpu/eigh_kernel.cu" - "gpu/eigvalsh_kernel.cu" - "gpu/lstsq_kernel.cu" - "gpu/lu_kernel.cu" - "gpu/matrix_rank_kernel.cu" - "gpu/matrix_rank_tol_kernel.cu" - "gpu/multiclass_nms3_kernel.cu" - "gpu/put_along_axis_grad_kernel.cu" - "gpu/put_along_axis_kernel.cu" - "gpu/qr_kernel.cu" - "gpu/svd_kernel.cu" - "gpudnn/mha_cudnn_frontend.cu" - "fusion/gpu/block_multi_head_attention_kernel.cu" - "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" - "fusion/gpu/fused_bn_add_activation_kernel.cu" - "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") -endif() - set(cc_search_pattern "*.cc" "cpu/*.cc" @@ -219,16 +233,16 @@ set(cc_search_pattern "fusion/*.cc" "stride/*.cc" "fusion/cpu/*.cc") +if(WITH_MUSA) + list(REMOVE_ITEM cc_search_pattern "sparse/*.cc") + list(REMOVE_ITEM cc_search_pattern "sparse/cpu/*.cc") +endif() if(WITH_MKLDNN) set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc" "fusion/onednn/*.cc") endif() -if(WITH_CUSTOM_DEVICE) - set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") -endif() - file( GLOB kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" @@ -252,7 +266,7 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/array_kernel.cc b/paddle/phi/kernels/array_kernel.cc index 8a599dcf9d80d..5389a26479213 100644 --- a/paddle/phi/kernels/array_kernel.cc +++ b/paddle/phi/kernels/array_kernel.cc @@ -134,7 +134,7 @@ PD_REGISTER_KERNEL(create_array, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(create_array, GPU, ALL_LAYOUT, @@ -178,7 +178,7 @@ PD_REGISTER_KERNEL(array_read, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(array_read, GPU, ALL_LAYOUT, @@ -208,7 +208,7 @@ PD_REGISTER_KERNEL(array_write, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(array_write, GPU, ALL_LAYOUT, @@ -238,7 +238,7 @@ PD_REGISTER_KERNEL(array_to_tensor, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(array_to_tensor, GPU, ALL_LAYOUT, diff --git 
a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index b4504f83818d7..eb884d53f3cd6 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -139,7 +139,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index b04c46351c2cf..01ba364ad3d3d 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -30,11 +30,15 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + namespace phi { -#ifdef PADDLE_WITH_HIP -static void RecordEventTimerCallback(hipStream_t stream, - hipError_t status, +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +static void RecordEventTimerCallback(musaStream_t stream, + musaError_t status, void *user_data) { struct timeval time_now {}; gettimeofday(&time_now, nullptr); @@ -60,6 +64,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreate(&start_); + musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -74,6 +81,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(start_); + musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -83,6 +93,8 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -91,6 +103,8 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -101,6 +115,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventSynchronize(stop_); + musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); cudaEventElapsedTime(&milliseconds, start_, stop_); @@ -144,6 +161,12 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&start_time_), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -163,6 +186,12 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&end_time_), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -178,6 +207,8 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif @@ -189,6 
+220,8 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index bf04c99dab0a3..dba08b0de366a 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 6e496a355302f..9f4b51281cd37 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index a60369af449f4..2d0ab05a8de78 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -309,6 +309,20 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif +#ifdef PADDLE_WITH_MUSA +PD_REGISTER_KERNEL(coalesce_tensor, + GPU, + ALL_LAYOUT, + phi::CoalesceTensorKernel, + phi::dtype::float16, + int, + float, + double) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); +} +#endif + #ifdef PADDLE_WITH_XPU PD_REGISTER_KERNEL(coalesce_tensor, XPU, diff --git a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc index 47e804b7de277..acd84a80be2ad 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc @@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, true, dev_ctx); + *x_grad, axis, indices, out_grad, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, true, dev_ctx); + *x_grad, axis, indices, out_grad, dev_ctx); } } @@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, true, dev_ctx); + *x_grad, axis, indices, out_grad, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, true, dev_ctx); + *x_grad, axis, indices, out_grad, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc index aceced1ce8531..0b11e3d6f98da 100644 --- a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc +++ b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc @@ -29,4 +29,4 @@ void DecodeJpegKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - decode_jpeg, CPU, ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} + decode_jpeg, CPU, 
ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} \ No newline at end of file diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 65ee3c1851003..81ed7170d7a24 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index dbab3bd326664..47ab1a7839066 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index aeb2071b136de..dd7b762849d16 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -25,14 +25,11 @@ namespace phi { template void PutAlongAxisGradKernel(const Context& dev_ctx, - const DenseTensor& x, + const DenseTensor& x UNUSED, const DenseTensor& index, - const DenseTensor& value, - const DenseTensor& out, const DenseTensor& out_grad, int axis, - const std::string& reduce, - bool include_self, + const std::string& reduce UNUSED, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( @@ -43,135 +40,31 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (include_self == false || reduce == "assign") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. - out_grad, - axis, - index, - *x_grad, - include_self, - dev_ctx); - } else { - phi::funcs::cpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, include_self, dev_ctx); - } - } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || - reduce == "amax") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *x_grad, - reduce, - include_self, - dev_ctx); - } else { - phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *x_grad, - reduce, - include_self, - dev_ctx); - } - } else if (reduce == "mean") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_mean_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- out_grad, - axis, - index, - *x_grad, - include_self, - dev_ctx); - } else { - phi::funcs::cpu_scatter_mean_input_grad_kernel( - out_grad, axis, index, *x_grad, include_self, dev_ctx); - } + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + dev_ctx); + } else { + phi::funcs::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - auto* grad_data = value_grad->data(); - int64_t grad_size = value_grad->numel(); - memset(grad_data, 0, sizeof(T) * grad_size); - if (reduce == "assign") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, include_self, dev_ctx); - } else if (index_type == DataType::INT64) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, include_self, dev_ctx); - } - } else if (reduce == "add" || reduce == "mean") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_add_mean_value_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *value_grad, - reduce, - include_self, - dev_ctx); - } else { - phi::funcs::cpu_scatter_add_mean_value_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *value_grad, - reduce, - include_self, - dev_ctx); - } - } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || - reduce == "amax") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *value_grad, - reduce, - include_self, - dev_ctx); - } else { - phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( - out_grad, - axis, - index, - out, - x, - value, - *value_grad, - reduce, - include_self, - dev_ctx); - } + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } else { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, dev_ctx); } } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index 4411755d61cba..5417f9463a62f 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -30,7 +30,6 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, - bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ( dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, @@ -42,56 +41,31 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, include_self, dev_ctx); + *out, axis, index, value, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, include_self, dev_ctx); + *out, axis, index, value, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, include_self, dev_ctx); + *out, axis, index, value, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, include_self, dev_ctx); + 
*out, axis, index, value, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, include_self, dev_ctx); + *out, axis, index, value, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, include_self, dev_ctx); - } - } else if (reduce == "mean") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_mean_kernel( - *out, axis, index, value, include_self, dev_ctx); - } else if (index_type == DataType::INT64) { - phi::funcs::cpu_scatter_mean_kernel( - *out, axis, index, value, include_self, dev_ctx); - } - } else if (reduce == "amax") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_max_kernel( - *out, axis, index, value, include_self, dev_ctx); - } else if (index_type == DataType::INT64) { - phi::funcs::cpu_scatter_max_kernel( - *out, axis, index, value, include_self, dev_ctx); - } - } else if (reduce == "amin") { - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_min_kernel( - *out, axis, index, value, include_self, dev_ctx); - } else if (index_type == DataType::INT64) { - phi::funcs::cpu_scatter_min_kernel( - *out, axis, index, value, include_self, dev_ctx); + *out, axis, index, value, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " - "'multiply', the " + "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " "default reduce " "op is 'assign' ", reduce)); diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index 66f3ef0cd790d..b7b33d4290dae 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -104,8 +104,7 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t, - phi::dtype::bfloat16) {} + int64_t) {} PD_REGISTER_KERNEL(repeat_interleave_grad, CPU, @@ -114,5 +113,4 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t, - phi::dtype::bfloat16) {} + int64_t) {} diff --git a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc index 8b00d7e38f304..388e243eff42a 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc @@ -25,8 +25,7 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t, - phi::dtype::bfloat16) {} + int64_t) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, CPU, @@ -35,5 +34,4 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t, - phi::dtype::bfloat16) {} + int64_t) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index 237a892dbb356..ed35513d98550 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -35,20 +35,3 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} - -PD_REGISTER_KERNEL(set_value_with_scalar_grad, - CPU, - ALL_LAYOUT, - phi::SetValueWithScalarGradKernel, - float, - double, - int, - int64_t, - bool, - int16_t, - uint8_t, - int8_t, - phi::dtype::bfloat16, - phi::dtype::float16, - phi::dtype::complex, - phi::dtype::complex) {} diff --git 
a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc index 4e5fc0c305100..8a7238203ec64 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -50,11 +50,10 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, - true, dev_ctx); // the gradient of gather is scatter } else if (index_type == phi::DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, true, dev_ctx); + *x_grad, axis, index, out_grad, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index d006f688ae243..d1b4a24b54eba 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -38,11 +38,9 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::cpu_gather_kernel( - x, axis, index, *out, true, dev_ctx); + phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::cpu_gather_kernel( - x, axis, index, *out, true, dev_ctx); + phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); } } diff --git a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc deleted file mode 100644 index ff61688513b13..0000000000000 --- a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/c_embedding_grad_kernel.h" -#include "glog/logging.h" -#include "paddle/phi/api/backward/backward_api.h" -#include "paddle/phi/api/include/api.h" -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -template -void CEmbeddingGradKernel(const Context& dev_ctx, - const DenseTensor& w, - const DenseTensor& ids, - const DenseTensor& out_grad, - int64_t start_index, - DenseTensor* w_grad) { - w_grad->Resize(w.dims()); - dev_ctx.template Alloc(w_grad, w.dtype()); - const auto& index_type = ids.dtype(); - if (index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64) { - auto K = ids.numel(); - auto N = w.dims()[0]; - auto D = w.dims()[1]; - - auto x_tmp = std::make_shared(); - x_tmp->ShareDataWith(ids).Resize({K}); - auto w_tmp = std::make_shared(); - w_tmp->set_meta(w.meta()); - dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype()); - auto out_grad_tmp = std::make_shared(); - out_grad_tmp->ShareDataWith(out_grad).Resize({K, D}); - paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp), - out_grad_tensor(out_grad_tmp); - - auto start_index_tensor = paddle::experimental::full_like( - x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); - auto end_index_tensor = paddle::experimental::full_like( - x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); - auto ids_mask_tensor = paddle::experimental::logical_and( - x_tensor.greater_equal(start_index_tensor), - x_tensor.less_than(end_index_tensor)); - auto real_ids_tensor = (x_tensor - start_index_tensor) - .multiply(paddle::experimental::cast( - ids_mask_tensor, x_tensor.dtype())); - auto out_grad_tensor_mul_mask = - paddle::experimental::reshape(out_grad_tensor, {K, D}) - .multiply(paddle::experimental::reshape( - paddle::experimental::cast(ids_mask_tensor, w.dtype()), - {K, 1})); - paddle::Tensor w_grad_tensor; - paddle::experimental::embedding_grad(real_ids_tensor, - w_tensor, - out_grad_tensor_mul_mask, - -1, - false, - &w_grad_tensor); - w_grad->ShareDataWith( - *reinterpret_cast(w_grad_tensor.impl().get())); - - } else { - PADDLE_THROW(phi::errors::Unavailable( - "Custom Device c_embedding_grad ids only support int32 or int64.")); - } -} -#endif -} // namespace phi - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -PD_REGISTER_KERNEL(c_embedding_grad, - Custom, - ALL_LAYOUT, - phi::CEmbeddingGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16) {} -#endif diff --git a/paddle/phi/kernels/custom/c_embedding_kernel.cc b/paddle/phi/kernels/custom/c_embedding_kernel.cc deleted file mode 100644 index 0cacf61d46f3a..0000000000000 --- a/paddle/phi/kernels/custom/c_embedding_kernel.cc +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/c_embedding_kernel.h" -#include "glog/logging.h" -#include "paddle/phi/api/backward/backward_api.h" -#include "paddle/phi/api/include/api.h" -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -template -void CEmbeddingKernel(const Context& dev_ctx, - const DenseTensor& w, - const DenseTensor& ids, - int64_t start_index, - int64_t vocab_size, - DenseTensor* out) { - const auto& index_type = ids.dtype(); - if (index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64) { - auto out_dims = out->dims(); - auto K = ids.numel(); - auto N = w.dims()[0]; - auto D = w.dims()[1]; - - auto x_tmp = std::make_shared(); - x_tmp->ShareDataWith(ids).Resize({K}); - auto w_tmp = std::make_shared(); - w_tmp->ShareDataWith(w).Resize({N, D}); - paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp); - - auto start_index_tensor = paddle::experimental::full_like( - x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); - auto end_index_tensor = paddle::experimental::full_like( - x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); - auto ids_mask_tensor = paddle::experimental::logical_and( - x_tensor.greater_equal(start_index_tensor), - x_tensor.less_than(end_index_tensor)); - auto ids_tensor = (x_tensor - start_index_tensor) - .multiply(paddle::experimental::cast( - ids_mask_tensor, x_tensor.dtype())); - auto out_tensor = - paddle::experimental::reshape( - paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()), - {K, 1}) - .multiply(paddle::experimental::reshape( - paddle::experimental::embedding( - ids_tensor, w_tensor, -1, false), - {K, D})); - out->ShareDataWith( - *reinterpret_cast(out_tensor.impl().get())) - .Resize(out_dims); - } else { - PADDLE_THROW(phi::errors::Unavailable( - "Custom Device c_embedding ids only support int32 or int64.")); - } -} -#endif -} // namespace phi - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -PD_REGISTER_KERNEL(c_embedding, - Custom, - ALL_LAYOUT, - phi::CEmbeddingKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16) {} -#endif diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 088a4fe4ffd26..170f9a3a4d608 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index d2391a5702d4b..60fc5236abc94 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index ebe1b1d24e50a..2b7c400bc6464 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) 
{} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dc61e6a650efa..6b22ac7518179 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index cd603dd57e64d..1886f5af4c1cb 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index d124e269e5c00..f2d43a19a246d 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,16 +8,16 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") endif() -# Note(qili93): remove kernels not supported on DCU yet -if(WITH_ROCM) - list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu") +if(WITH_MUSA) + list(REMOVE_ITEM func_cu_srcs + "softmax.cu") endif() collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 06b59644cf11d..dcad9755ee4e0 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -3013,7 +3013,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index 5f66f6f1abd4d..cab4d32a99826 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,7 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +63,7 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound +#if defined(__CUDA_ARCH__) || 
defined(__HIPCC__) || defined(__MUSACC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 140eca890480f..69e13d29874d5 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,7 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +303,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -360,7 +360,7 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -445,7 +445,7 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(ARGS... args) const { Base()->template MatMulWithHead(args...); @@ -543,7 +543,7 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); @@ -593,3 +593,7 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/kernels/funcs/blas/blas_impl.mu.h" +#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index ffafe15b8fcf2..a4233d9a4147a 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1451,7 +1451,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_MUSA) && \ !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template @@ -1698,7 +1698,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h new file mode 100644 index 0000000000000..c6391acab6d89 --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h @@ -0,0 +1,1602 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__MUSACC__) +#include +#endif +#include "glog/logging.h" +#include "paddle/utils/flags.h" + +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +PHI_DECLARE_bool(enable_cublas_tensor_op_math); +PHI_DECLARE_bool(gemm_use_half_precision_compute_type); + +namespace phi { +namespace funcs { + +template +struct CUBlas; + +template <> +struct CUBlas { + template + static void GEMM(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemm(args...)); + } + + template + static void AXPY(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSaxpy(args...)); + } + + template + static void SCAL(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSscal(args...)); + } + + template + static void VCOPY(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasScopy(args...)); + } + + template + static void GEMV(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemv(args...)); + } + + template + static void GEMM_BATCH(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmBatched(args...)); + } + + template + static void GEMM_STRIDED_BATCH(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mublasSgemmStridedBatched(args...)); + } + + template + static void GEMM_EX(phi::GPUContext *dev_ctx, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, + const void *A, + musaDataType_t Atype, + int lda, + const void *B, + musaDataType_t Btype, + int ldb, + const float *beta, + void *C, + musaDataType_t Ctype, + int ldc) { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. + // VLOG(5) << "use_tensor_op_math: " + // << (dev_ctx->tensor_core_available() ? "True" : "False"); + // dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmEx(handle, + // transa, + // transb, + // m, + // n, + // k, + // alpha, + // A, + // Atype, + // lda, + // B, + // Btype, + // ldb, + // beta, + // C, + // Ctype, + // ldc)); + // }); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasSgemmEx.")); + } + + template + static void TRSM(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsm(args...)); + } + + template + static void GETRF_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrfBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasSgetrfBatched.")); + } + + template + static void GETRI_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetriBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasSgetriBatched.")); + } + + template + static void MATINV_BATCH(ARGS... 
args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSmatinvBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasSmatinvBatched.")); + } + + template + static void GETRS_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrsBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasSgetrsBatched.")); + } + + template + static void TRSM_BATCH(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsmBatched(args...)); + } +}; + +template <> +struct CUBlas { + template + static void GEMM(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemm(args...)); + } + + template + static void AXPY(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDaxpy(args...)); + } + + template + static void SCAL(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDscal(args...)); + } + + template + static void VCOPY(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDcopy(args...)); + } + + template + static void GEMV(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemv(args...)); + } + + template + static void GEMM_BATCH(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemmBatched(args...)); + } + + template + static void GEMM_STRIDED_BATCH(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mublasDgemmStridedBatched(args...)); + } + + template + static void GEMM_EX(ARGS... args UNUSED) { + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasDgemmEx.")); + } + + template + static void TRSM(ARGS... args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsm(args...)); + } + + template + static void GETRF_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrfBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasDgetrfBatched.")); + } + + template + static void GETRI_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetriBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasDgetriBatched.")); + } + + template + static void MATINV_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDmatinvBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasDmatinvBatched.")); + } + + template + static void GETRS_BATCH(ARGS... args) { + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrsBatched(args...)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasDgetrsBatched.")); + } + + template + static void TRSM_BATCH(ARGS... 
args) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsmBatched(args...)); + } +}; + +template <> +struct CUBlas { + using float16 = phi::dtype::float16; + + static void GEMM(mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const float16 *alpha, + const float16 *A, + int lda, + const float16 *B, + int ldb, + const float16 *beta, + float16 *C, + int ldc) { + // PADDLE_ENFORCE_GPU_SUCCESS( + // phi::dynload::mublasHgemm(handle, + // transa, + // transb, + // m, + // n, + // k, + // reinterpret_cast(alpha), + // reinterpret_cast(A), + // lda, + // reinterpret_cast(B), + // ldb, + // reinterpret_cast(beta), + // reinterpret_cast<__half *>(C), + // ldc)); + PADDLE_THROW( + phi::errors::Unimplemented("murrently there are not mublasHgemm.")); + } + + static void GEMM_BATCH(phi::GPUContext *dev_ctx, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, + const float16 **A, + musaDataType_t Atype, + int lda, + const float16 **B, + musaDataType_t Btype, + int ldb, + const float *beta, + float16 **C, + musaDataType_t Ctype, + int ldc, + int batchCount, + musaDataType_t computeType) { + PADDLE_THROW(phi::errors::Unimplemented( + "mublasGemmBatchedEx is not supported")); + } + + static void GEMM_STRIDED_BATCH(mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const float16 *alpha, + const float16 *A, + int lda, + long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, + long long int strideB, // NOLINT + const float16 *beta, + float16 *C, + int ldc, + long long int strideC, // NOLINT + int batchCount) { + PADDLE_THROW(phi::errors::Unimplemented( + "mublasHgemmStridedBatched is not supported")); + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasHgemmStridedBatched( + // handle, + // transa, + // transb, + // m, + // n, + // k, + // reinterpret_cast(alpha), + // reinterpret_cast(A), + // lda, + // strideA, + // reinterpret_cast(B), + // ldb, + // strideB, + // reinterpret_cast(beta), + // reinterpret_cast<__half *>(C), + // ldc, + // strideC, + // batchCount)); + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(phi::GPUContext *dev_ctx, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + musaDataType_t Atype, + int lda, + const void *B, + musaDataType_t Btype, + int ldb, + const void *beta, + void *C, + musaDataType_t Ctype, + int ldc, + musaDataType_t computeType) { + mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); + } +}; + +template <> +struct CUBlas> { + static void GEMV(mublasHandle_t handle, + mublasOperation_t transa, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + const phi::dtype::complex *B, + int ldb, + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemv( + handle, + transa, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc)); + } + + static void AXPY(mublasHandle_t handle, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *X, + const int incX, + phi::dtype::complex *Y, + const int incY) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCaxpy( + handle, + n, + reinterpret_cast(alpha), + reinterpret_cast(X), + incX, + reinterpret_cast(Y), + incY)); + } + + static void GEMM_STRIDED_BATCH(mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + long long int strideA, // NOLINT + const phi::dtype::complex *B, // NOLINT + int ldb, + long long int strideB, // NOLINT + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc, + long long int strideC, // NOLINT + int batchCount) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemmStridedBatched( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); + } + + static void GEMM(mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + const phi::dtype::complex *B, + int ldb, + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemm( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc)); + } + + static void TRSM(mublasHandle_t handle, + mublasSideMode_t side, + mublasFillMode_t uplo, + mublasOperation_t transa, + mublasDiagType_t diag, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + phi::dtype::complex *B, + int ldb) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsm( + handle, + side, + uplo, + transa, + diag, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb)); + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
+ // https://docs.nvidia.com/muda/mublas/index.html#mublassetmathmode + template + static void GEMM_EX(phi::GPUContext *dev_ctx, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + musaDataType_t Atype, + int lda, + const void *B, + musaDataType_t Btype, + int ldb, + const void *beta, + void *C, + musaDataType_t Ctype, + int ldc, + musaDataType_t computeType) { + mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); + } + + static void TRSM_BATCH(mublasHandle_t handle, + mublasSideMode_t side, + mublasFillMode_t uplo, + mublasOperation_t transa, + mublasDiagType_t diag, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **B, + int ldb, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsmBatched( + handle, + side, + uplo, + transa, + diag, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + batch_size)); + } +}; + +template <> +struct CUBlas> { + static void GEMV(mublasHandle_t handle, + mublasOperation_t transa, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + const phi::dtype::complex *B, + int ldb, + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemv( + handle, + transa, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc)); + } + + static void AXPY(mublasHandle_t handle, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *X, + const int incX, + phi::dtype::complex *Y, + const int incY) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZaxpy( + handle, + n, + reinterpret_cast(alpha), + reinterpret_cast(X), + incX, + reinterpret_cast(Y), + incY)); + } + + static void GEMM_STRIDED_BATCH( + mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + long long int strideA, // NOLINT + const phi::dtype::complex *B, // NOLINT + int ldb, + long long int strideB, // NOLINT + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc, + long long int strideC, // NOLINT + int batchCount) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemmStridedBatched( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); + } + + static void GEMM(mublasHandle_t handle, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + const phi::dtype::complex *B, + int ldb, + const phi::dtype::complex *beta, + phi::dtype::complex *C, + int ldc) { + 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemm( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc)); + } + + static void TRSM(mublasHandle_t handle, + mublasSideMode_t side, + mublasFillMode_t uplo, + mublasOperation_t transa, + mublasDiagType_t diag, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex *A, + int lda, + phi::dtype::complex *B, + int ldb) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsm( + handle, + side, + uplo, + transa, + diag, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb)); + } + + static void TRSM_BATCH(mublasHandle_t handle, + mublasSideMode_t side, + mublasFillMode_t uplo, + mublasOperation_t transa, + mublasDiagType_t diag, + int m, + int n, + const phi::dtype::complex *alpha, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **B, + int ldb, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsmBatched( + handle, + side, + uplo, + transa, + diag, + m, + n, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + batch_size)); + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(phi::GPUContext *dev_ctx, + mublasOperation_t transa, + mublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + musaDataType_t Atype, + int lda, + const void *B, + musaDataType_t Btype, + int ldb, + const void *beta, + void *C, + musaDataType_t Ctype, + int ldc, + musaDataType_t computeType) { + mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); + + } +}; + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + mublasOperation_t cuTransB = + (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const { + // // Note that cublas follows fortran order, so the order is different from + // // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? 
                             MUBLAS_OP_N : MUBLAS_OP_T;
+  mublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);
+  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
+  CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
+                                       cuTransB,
+                                       cuTransA,
+                                       N,
+                                       M,
+                                       K,
+                                       &h_alpha,
+                                       B,
+                                       MUSA_R_16F,
+                                       ldb,
+                                       A,
+                                       MUSA_R_16F,
+                                       lda,
+                                       &h_beta,
+                                       C,
+                                       MUSA_R_16F,
+                                       N,
+                                       (musaDataType_t)0);  // MUSA_R_32F, see https://jira.mthreads.com/browse/SW-37038
+}
+
+template <>
+template <>
+inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        phi::dtype::bfloat16 alpha,
+                                        const phi::dtype::bfloat16 *A,
+                                        const phi::dtype::bfloat16 *B,
+                                        phi::dtype::bfloat16 beta,
+                                        phi::dtype::bfloat16 *C) const {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Blas::GEMM for dtype bfloat16 is not supported on MUSA now!"));
+}
+
+template <>
+template <>
+inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        phi::dtype::complex<float> alpha,
+                                        const phi::dtype::complex<float> *A,
+                                        const phi::dtype::complex<float> *B,
+                                        phi::dtype::complex<float> beta,
+                                        phi::dtype::complex<float> *C) const {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Blas::GEMM for dtype complex<float> is not supported on MUSA now!"));
+}
+
+template <>
+template <>
+inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        phi::dtype::complex<double> alpha,
+                                        const phi::dtype::complex<double> *A,
+                                        const phi::dtype::complex<double> *B,
+                                        phi::dtype::complex<double> beta,
+                                        phi::dtype::complex<double> *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  mublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+  mublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+
+  thrust::complex<double> c_alpha =
+      thrust::complex<double>(alpha.real, alpha.imag);
+  thrust::complex<double> c_beta =
+      thrust::complex<double>(beta.real, beta.imag);
+  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
+  // Originally MUSA_C_64F; due to a bug the enum value has to be specified
+  // manually, see https://jira.mthreads.com/browse/SW-37038
+  CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
+                                               cuTransB,
+                                               cuTransA,
+                                               N,
+                                               M,
+                                               K,
+                                               &c_alpha,
+                                               B,
+                                               (musaDataType_t)5,  // MUSA_C_64F
+                                               ldb,
+                                               A,
+                                               (musaDataType_t)5,  // MUSA_C_64F
+                                               lda,
+                                               &c_beta,
+                                               C,
+                                               (musaDataType_t)5,  // MUSA_C_64F
+                                               N,
+                                               (musaDataType_t)5);  // MUSA_C_64F
+}
+
+template <>
+template <typename T>
+void Blas<phi::GPUContext>::GEMM(bool transA,
+                                 bool transB,
+                                 int M,
+                                 int N,
+                                 int K,
+                                 T alpha,
+                                 const T *A,
+                                 int lda,
+                                 const T *B,
+                                 int ldb,
+                                 T beta,
+                                 T *C,
+                                 int ldc) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N;
+  mublasOperation_t cuTransB = transB ?
MUBLAS_OP_T : MUBLAS_OP_N; + + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + (musaDataType_t)0,//MUSA_R_32F, + ldb, + A, + (musaDataType_t)0,//MUSA_R_32F, + lda, + &beta, + C, + (musaDataType_t)0,//MUSA_R_32F, + ldc); + } else { + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); + } +} + +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + int lda, + const phi::dtype::float16 *B, + int ldb, + phi::dtype::float16 beta, + phi::dtype::float16 *C, + int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N; + mublasOperation_t cuTransB = transB ? MUBLAS_OP_T : MUBLAS_OP_N; + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); +} +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { + PADDLE_THROW(phi::errors::Unimplemented( + "Blas::GEMM for dtype bfloat16 is not supported on MUSA now!")); +} + +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} + +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](mublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} + +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](mublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} + +template <> +template +void Blas::GEMV(bool trans_a, + int M, + int N, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const { + mublasOperation_t cuTransA = !trans_a ? MUBLAS_OP_T : MUBLAS_OP_N; + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM( + CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); + } else { + this->template GEMM( + CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. 
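+  // The GEMV is expressed as a GEMM with a unit-sized output dimension:
+  //   trans_a : y^T (1 x N) = x^T (1 x M) * A (M x N)
+  //   !trans_a: y   (M x 1) = A   (M x N) * x (N x 1)
+  // so the bf16 GEMM path (and its GemmEx dispatch) is reused instead of a
+  // dedicated half-precision gemv kernel.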
+  if (trans_a) {
+    this->template GEMM<phi::dtype::bfloat16>(
+        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
+  } else {
+    this->template GEMM<phi::dtype::bfloat16>(
+        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
+  }
+}
+
+template <>
+template <typename T>
+void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        T alpha,
+                                        const T *A,
+                                        const T *B,
+                                        T beta,
+                                        T *C,
+                                        int batchCount,
+                                        int64_t strideA,
+                                        int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  mublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+  mublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+  const int64_t strideC = M * N;
+  context_.CublasCall([&](mublasHandle_t handle) {
+    CUBlas<T>::GEMM_STRIDED_BATCH(handle,
+                                  cuTransB,
+                                  cuTransA,
+                                  N,
+                                  M,
+                                  K,
+                                  &alpha,
+                                  B,
+                                  ldb,
+                                  strideB,
+                                  A,
+                                  lda,
+                                  strideA,
+                                  &beta,
+                                  C,
+                                  ldc,
+                                  strideC,
+                                  batchCount);
+  });
+}
+
+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               phi::dtype::bfloat16 alpha,
+                                               const phi::dtype::bfloat16 *A,
+                                               const phi::dtype::bfloat16 *B,
+                                               phi::dtype::bfloat16 beta,
+                                               phi::dtype::bfloat16 *C,
+                                               int batchCount,
+                                               int64_t strideA,
+                                               int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  // int lda = (transA == CblasNoTrans) ? K : M;
+  // int ldb = (transB == CblasNoTrans) ? N : K;
+  // int ldc = N;
+  // mublasOperation_t cuTransA =
+  //     (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+  // mublasOperation_t cuTransB =
+  //     (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T;
+  // const int64_t strideC = M * N;
+
+  // float h_alpha = static_cast<float>(alpha);
+  // float h_beta = static_cast<float>(beta);
+
+  // mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT;
+  // bool use_tensor_op_math = context_.tensor_core_available();
+  // if (use_tensor_op_math) {
+  //   algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP;
+  // }
+  // VLOG(5) << "use_tensor_op_math: "
+  //         << (use_tensor_op_math ? "True" : "False");
+
+  // context_.TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) {
+  //   PADDLE_ENFORCE_GPU_SUCCESS(
+  //       phi::dynload::mublasGemmStridedBatchedEx(handle,
+  //                                                cuTransB,
+  //                                                cuTransA,
+  //                                                N,
+  //                                                M,
+  //                                                K,
+  //                                                &h_alpha,
+  //                                                B,
+  //                                                MUSA_R_16BF,
+  //                                                ldb,
+  //                                                strideB,
+  //                                                A,
+  //                                                MUSA_R_16BF,
+  //                                                lda,
+  //                                                strideA,
+  //                                                &h_beta,
+  //                                                C,
+  //                                                MUSA_R_16BF,
+  //                                                ldc,
+  //                                                strideC,
+  //                                                batchCount,
+  //                                                MUBLAS_COMPUTE_32F,
+  //                                                algo));
+  // });
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Currently there is no mublasGemmStridedBatchedEx, so BatchedGEMM for "
+      "dtype bfloat16 is not supported on MUSA now!"));
+}
+
+template <>
+template <typename T>
+void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        T alpha,
+                                        const T **A,
+                                        const T **B,
+                                        T beta,
+                                        T **C,
+                                        int batchCount) const {
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<T>(
+        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
+  }
+}
+
+#if defined(__MUSACC__)
+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               double alpha,
+                                               const double **A,
+                                               const double **B,
+                                               double beta,
+                                               double **C,
+                                               int batchCount) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
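+  // A, B and C are host arrays of device pointers (one matrix per batch
+  // entry).  They are staged into thrust::device_vector below so the batched
+  // muBLAS call can read the pointer table from device memory; this is the
+  // pointer-array variant, unlike the strided-batched path above, which
+  // assumes a fixed stride between consecutive matrices.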
+ int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + mublasOperation_t cuTransB = + (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float **A, + const float **B, + float beta, + float **C, + int batchCount) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + mublasOperation_t cuTransB = + (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 **A, + const phi::dtype::float16 **B, + phi::dtype::float16 beta, + phi::dtype::float16 **C, + int batchCount) const { + PADDLE_THROW(phi::errors::Unimplemented( + "Blas::BatchedGEMM for dtype float16 is not supported on MUSA now!")); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 **A, + const phi::dtype::bfloat16 **B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 **C, + int batchCount) const { + PADDLE_THROW(phi::errors::Unimplemented( + "Blas::BatchedGEMM for bfloat16 is not supported on MUSA now!")); +} +#endif +template <> +template +void Blas::TRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T *A, + int lda, + T *B, + int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + mublasSideMode_t cuSide = + (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; + mublasFillMode_t cuUplo = + (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + mublasDiagType_t cuDiag = + (diag == CblasUnit) ? 
MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); +} + +template <> +template +void Blas::BatchedGETRF( + int n, T **a, int *ipiv, int *info, int batch_size) const { + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} + +template <> +template +void Blas::BatchedGETRI(int n, + const T **a, + const int *ipiv, + T **a_inv, + int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, + a, + phi::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, + a)); + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} + +template <> +template +void Blas::BatchedMatInv( + int n, const T **a, T **a_inv, int *info, int batch_size) const { + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} + +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, + int n, + int nrhs, + const T **a, + int lda, + int *ipiv, + T **b, + int ldb, + int *info, + int batch_size) const { + // use CUBLAS_OP_C (conjugate transpose) for complex + mublasOperation_t cuTrans = + (trans == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); +} + +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T **A, + int lda, + T **B, + int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + mublasSideMode_t cuSide = + (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; + mublasFillMode_t cuUplo = + (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + mublasOperation_t cuTransA = + (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; + mublasDiagType_t cuDiag = + (diag == CblasUnit) ? MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](mublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 822801e10c357..c25ab4b55cb53 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +27,7 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 76adc40c4f9f9..6d426d764e221 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -86,7 +86,7 @@ HOSTDEVICE static void PrintAndThrowError(const char* debug_info, int64_t num_nan, int64_t num_inf, int64_t num_zero) { -#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) +#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) && !defined(__MUSACC__) PADDLE_THROW(phi::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index f2b7de681bcfc..877bd056ac542 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -49,7 +49,7 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index e6d587a61e11a..2d210f3200937 100644 --- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -29,7 +29,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group for GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group for GRU CPU template void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, T *gate_value, diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index b491cbe120d06..d0f714831549b 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -144,7 +144,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, } for (int i = 0; i < Tiled_size; ++i) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; @@ -206,7 +206,7 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, } for (int i = 0; i < Tiled_size; ++i) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h index 9e2aef1940619..f5a16ade4fd23 100644 
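Note: the gru_gpu_kernel.h hunks above put MUSA builds on the __shfl_sync
branch, presumably because the MUSA compiler provides the synchronizing warp
shuffle rather than the legacy __shfl. A minimal sketch of the broadcast and
accumulate pattern those kernels rely on (illustrative only, not code from
this patch):

    // Every lane reads lane i's value of `a` and multiplies it with its own
    // b[i], accumulating a per-lane partial dot product.
    template <typename T, int TiledSize>
    __device__ T TiledDot(unsigned tiled_mask, T a, const T *b) {
      T c = static_cast<T>(0);
      for (int i = 0; i < TiledSize; ++i) {
    #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__)
        c += __shfl_sync(tiled_mask, a, i, TiledSize) * b[i];
    #else
        c += __shfl(a, i, TiledSize) * b[i];
    #endif
      }
      return c;
    }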
--- a/paddle/phi/kernels/funcs/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -44,7 +44,7 @@ class gru_resetOutput { (*value_reset_output + *value_reset_bias) * (*value_reset_gate); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset output +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset output #ifndef __AVX__ static const bool avx = false; #else @@ -90,7 +90,7 @@ class gru_finalOutput { ((*value_update_gate) * (*value_frame_state)); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU final output +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___)// @{ Group GRU final output #ifndef __AVX__ static const bool avx = false; #else @@ -150,7 +150,7 @@ class gru_stateGrad { *grad_output * (*value_update_gate), *value_frame_state, act_input); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU state grad +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU state grad #ifndef __AVX__ static const bool avx = false; #else @@ -211,7 +211,7 @@ class gru_resetGrad { *grad_reset_gate = activation(*grad_reset_gate, *value_reset_gate, act_gate); } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset grad +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset grad #ifndef __AVX__ static const bool avx = false; #else @@ -265,7 +265,7 @@ class gru { reset_output * (*grad_frame_state), *value_reset_gate, act_gate); *grad_reset_output = (*value_reset_gate) * (*grad_frame_state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU CPU #ifndef __AVX__ static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index e8b8e957c80d1..b0702d560fa51 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -36,7 +36,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM CPU template void naive_lstm_forward_one_sequence(Op op, diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h index 0846f05a0c2c5..264322521d477 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -59,7 +59,7 @@ class lstm { *state_atv = activation(*state, active_state); *output = (*value_og) * (*state_atv); } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM FWD +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM FWD #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default static const bool avx = false; #else @@ -163,7 +163,7 @@ class lstm { *checkFGrad = (*grad_fg) * (*prev_state); *checkOGrad = (*grad_og) * (*state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM BWD +#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM BWD #ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 03e3bdde05ad0..555b1d3fb250e 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +41,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +68,7 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 5504a337e88f2..6f4e5fceec473 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) thrust::device_vector diag_vec(common::vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index abade7ac0ef87..f9c6a0934dc6a 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -21,6 +21,10 @@ limitations under the License. */ #include #endif +#ifdef __MUSACC__ +#include +#endif + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/amp_type_traits.h" @@ -28,7 +32,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -49,7 +53,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) - std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -113,7 +117,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) namespace kps = phi::kps; @@ -122,19 +126,19 @@ namespace kps = phi::kps; template struct normal_distribution; -#if defined(__NVCC__) +#if defined(__MUSACC__) template struct uniform_distribution { - __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const { - return static_cast(curand_uniform(state)); + __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const { + return static_cast(murand_uniform(state)); } static constexpr int kReturnsCount = 1; }; template <> struct uniform_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_uniform4(state); + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_uniform4(state); } static constexpr int kReturnsCount = 4; }; @@ -142,16 +146,16 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_uniform2_double(state); + murandStatePhilox4_32_10_t *state) const { + return murand_uniform2_double(state); } static constexpr int kReturnsCount = 2; }; template <> struct uniform_distribution { - __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand4(state); + __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand4(state); } static constexpr int kReturnsCount = 4; }; @@ -159,9 +163,9 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline ulonglong2 operator()( - curandStatePhilox4_32_10_t *state) const { + murandStatePhilox4_32_10_t *state) const { ulonglong2 result; - uint4 rand = curand4(state); + uint4 rand = murand4(state); result.x = (uint64_t)rand.x << 32 | rand.y; result.y = (uint64_t)rand.z << 32 | rand.w; return result; @@ -171,8 +175,8 @@ struct uniform_distribution { template <> struct normal_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_normal4(state); + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_normal4(state); } static constexpr int kReturnsCount = 4; }; @@ -180,8 +184,8 @@ struct normal_distribution { template <> struct normal_distribution { __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_normal2_double(state); + murandStatePhilox4_32_10_t *state) const { + return murand_normal2_double(state); } static constexpr int kReturnsCount = 2; }; @@ -264,10 +268,10 @@ __global__ void DistributionKernel(size_t size, size_t stride) { size_t idx = 
static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__NVCC__) - curandStatePhilox4_32_10_t state; - curand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = curandStatePhilox4_32_10_t; +#if defined(__MUSACC__) + murandStatePhilox4_32_10_t state; + murand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = murandStatePhilox4_32_10_t; #else hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, offset, &state); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 985c028afb2a8..87283549f8e29 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -20,6 +20,12 @@ limitations under the License. */ #include #include #endif + +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #include @@ -146,6 +152,10 @@ __global__ void VectorizedRandomGenerator( hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murandStatePhilox4_32_10_t state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -216,6 +226,10 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murandStatePhilox4_32_10_t state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -288,6 +302,11 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -349,7 +368,7 @@ void DropoutFwGPUKernelDriver( } else { bool copy_in_kernel = GetSeedDataAndIncrement( dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VectorizedRandomGenerator <<>>(0, size, @@ -449,6 +468,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index c92acdaf4180b..5b2657704367e 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) || defined(__MUSACC__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -150,7 +150,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -485,7 +485,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) // static unroller template