diff --git a/.github/workflows/Slice-baseline.yml b/.github/workflows/Slice-baseline.yml index ca544625a45eb9..4ab346a7a2a4dc 100644 --- a/.github/workflows/Slice-baseline.yml +++ b/.github/workflows/Slice-baseline.yml @@ -2,6 +2,13 @@ name: Slice-baseline-paddle on: workflow_dispatch: + inputs: + PR_ID: + required: false + type: string + COMMIT_ID: + required: false + type: string schedule: - cron: '0 20 * * 0' @@ -43,3 +50,5 @@ jobs: docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }} slice-check: 'true' SLICE_TEST_MODE: insert_baseline + MANUALLY_PR_ID: ${{ inputs.PR_ID }} + MANUALLY_COMMIT_ID: ${{ inputs.COMMIT_ID }} diff --git a/.github/workflows/_Linux-XPU.yml b/.github/workflows/_Linux-XPU.yml index cef20d6123de01..0991952dc629f8 100644 --- a/.github/workflows/_Linux-XPU.yml +++ b/.github/workflows/_Linux-XPU.yml @@ -206,7 +206,7 @@ jobs: CCACHE_DIR: /root/.ccache CCACHE_MAXSIZE: 150G CCACHE_LIMIT_MULTIPLE: 0.8 - IF_KUNLUN3: "OFF" + IF_KUNLUN3: "ON" GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} home_dir: ${{ github.workspace }}/../../../.. FLAGS_use_stride_kernel: "0" diff --git a/.github/workflows/_Slice.yml b/.github/workflows/_Slice.yml index 8f74843fb313e9..bbc32719c36e95 100644 --- a/.github/workflows/_Slice.yml +++ b/.github/workflows/_Slice.yml @@ -20,6 +20,12 @@ on: type: string required: false default: 'paddle' + MANUALLY_PR_ID: + type: string + required: false + MANUALLY_COMMIT_ID: + type: string + required: false env: PR_ID: ${{ github.event.pull_request.number || '0' }} @@ -47,6 +53,7 @@ jobs: slice: name: Slice test + needs: check-bypass if: ${{ inputs.can-skip != 'true' && needs.check-bypass.outputs.can-skip != 'true' }} runs-on: group: slice @@ -105,7 +112,11 @@ jobs: if [[ "${{ inputs.SLICE_BENCHMARK_FRAMEWORKS }}" == "torch" ]];then python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118 else - python3.10 -m pip install $wheel_link + if [[ "${{ inputs.MANUALLY_PR_ID }}" == "" ]]; then + python3.10 -m pip install $wheel_link + else + python3.10 -m pip install https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + fi fi python3.10 -m pip install -r PaddleTest/framework/e2e/api_benchmark/requirement.txt cd PaddleTest/framework/slice_benchmark diff --git a/ci/check_approval.sh b/ci/check_approval.sh index 29fc804fa37452..f846d8a01d0f7d 100644 --- a/ci/check_approval.sh +++ b/ci/check_approval.sh @@ -309,6 +309,12 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${PR_ID}" != "" ]; then check_approval 1 luotao1 zhangbo9674 phlrain fi +CHINESE_CHECK=$(git diff -U0 upstream/$BRANCH |grep "^+" |grep -P '[\p{Han}]') +if [ "${CHINESE_CHECK}" != "" ] && [ "${PR_ID}" != "" ]; then + echo_line="Not recommended to use Chinese. You must have one RD (tianshuo78520a or swgu98 or zhangbo9674 or risemeup1) approval." 
+ check_approval 1 tianshuo78520a swgu98 zhangbo9674 risemeup1 +fi + ALL_ADDED_LINES=$(git diff -U0 upstream/$BRANCH |grep "^+" || true) ALL_PADDLE_CHECK=$(echo $ALL_ADDED_LINES |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true) VALID_PADDLE_CHECK=$(echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(([^,;]+,)*[^";]*errors::.[^"]*".[^";]{20,}.[^;]*\);\s' || true) diff --git a/ci/kunlun_test.sh b/ci/kunlun_test.sh index e782a4b9787850..817099f8a342d7 100644 --- a/ci/kunlun_test.sh +++ b/ci/kunlun_test.sh @@ -159,9 +159,10 @@ set +x git clone --depth 1000 https://gitee.com/paddlepaddle/PaddleX.git cd PaddleX pip install -e . + pip install numpy==1.24.4 pypdfium2 #install paddle x dependency - paddlex --install PaddleClas + paddlex --install PaddleClas -y #download paddle dataset wget -q https://paddle-model-ecology.bj.bcebos.com/paddlex/data/cls_flowers_examples.tar -P ./dataset diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake index aaa86f50aa8faf..6cf2ffe32881a1 100644 --- a/cmake/cinn.cmake +++ b/cmake/cinn.cmake @@ -26,12 +26,21 @@ else() endif() if(NOT DEFINED ENV{runtime_include_dir}) - message( - STATUS - "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") - set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") - add_definitions( - -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + if(WITH_GPU) + message( + STATUS + "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + add_definitions( + -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + elseif(WITH_ROCM) + message( + STATUS + "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + add_definitions( + -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + endif() endif() if(WITH_TESTING) @@ -118,6 +127,10 @@ if(WITH_ROCM) add_definitions(-DCINN_WITH_HIP) endif() link_libraries(${ROCM_HIPRTC_LIB}) + + message( + STATUS "copy paddle/cinn/common/float16.h to $ENV{runtime_include_dir}") + file(COPY paddle/cinn/common/float16.h DESTINATION $ENV{runtime_include_dir}) endif() set(cinnapi_src CACHE INTERNAL "" FORCE) diff --git a/doc/README_cn.md b/doc/README_cn.md new file mode 100644 index 00000000000000..cb643ee6e9ac02 --- /dev/null +++ b/doc/README_cn.md @@ -0,0 +1,6 @@ +# To Readers and Developers +Thank you for reading the PaddlePaddle documentation. + +As of **September 17, 2018**, the documentation source for the **0.15.0 and develop** branches has been migrated to the [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) repository, where it will be continuously updated. + +Please visit the FluidDoc repository for the latest documentation. diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 3ebf18825a7b0a..369da2ba855b9e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -742,9 +742,7 @@ class SplitOpPattern : public pir::OpRewritePattern { using pir::OpRewritePattern::OpRewritePattern; bool Match(paddle::dialect::SplitOp op) const override { - const bool is_denied = CompatibleInfo::IsDeniedForCinn(*op.operation()); - - return !is_denied && PatternConstraint(op); + return PatternConstraint(op); } void Rewrite(paddle::dialect::SplitOp op, diff --git a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index 
cbfe4e05c09ad9..10fa55bb051c6b 100644 --- a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -338,7 +338,7 @@ extern "C" { __device__ inline int FN_INT32(pow)(int a, int b) { if (a == 0 && b < 0) { - return -1; + return 0; } float res = pow(__int2float_rd(a), __int2float_rd(b)); return __float2int_rn(res); @@ -418,6 +418,9 @@ __device__ inline long long int FN_INT64(exp)(long long int a) { __device__ inline long long int FN_INT64(pow)(long long int a, long long int b) { + if (a == 0 && b < 0) { + return 0; + } double res = pow(__ll2double_rd(a), __ll2double_rd(b)); return __double2ll_rn(res); } diff --git a/paddle/common/layout.h b/paddle/common/layout.h index 4c2fb90794eb52..016c8b828c72e5 100644 --- a/paddle/common/layout.h +++ b/paddle/common/layout.h @@ -85,6 +85,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kAnyLayout; } else if (s == "MKLDNNLAYOUT") { return DataLayout::kMKLDNN; + } else if (s == "ONEDNNLAYOUT") { + return DataLayout::ONEDNN; } else if (s == "SPARSE_COO") { return DataLayout::SPARSE_COO; } else if (s == "SPARSE_CSR") { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 296989f7d612a2..3455922b3066eb 100755 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -321,7 +321,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, continue; } } else if (pass->Type() == "onednn_placement_pass") { - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set(onednn_enabled_op_types_)); } VLOG(1) << "Start Apply Pass " << pass->Type(); diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc index 62748541683476..d1c73836ac90d9 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc @@ -64,7 +64,7 @@ inline bool FoundPhiOneDNNKernelWithCorrectDataType( return false; } -bool MKLDNNPlacementPass::IsSupport(const Node* op) const { +bool ONEDNNPlacementPass::IsSupport(const Node* op) const { if (FoundOneDNNKernelWithCorrectDataType(op) || FoundPhiOneDNNKernelWithCorrectDataType(op)) { // For interpolate ops, there's a little difference between Paddle and @@ -89,8 +89,8 @@ bool MKLDNNPlacementPass::IsSupport(const Node* op) const { } // namespace paddle::framework::ir -REGISTER_PASS(onednn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) - .RequirePassAttr("mkldnn_enabled_op_types"); +REGISTER_PASS(onednn_placement_pass, paddle::framework::ir::ONEDNNPlacementPass) + .RequirePassAttr("onednn_enabled_op_types"); REGISTER_PASS_CAPABILITY(onednn_placement_pass) .AddCombination( diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h index 5fc1dbd24f18ef..b7e0e1d3383c69 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h @@ -26,17 +26,17 @@ namespace ir { /* * Specifies which operators should use MKLDNN. 
*/ -class MKLDNNPlacementPass : public PlacementPassBase { +class ONEDNNPlacementPass : public PlacementPassBase { protected: bool IsSupport(const Node* op) const override; private: - const std::string GetPlacementName() const override { return "MKLDNN"; } + const std::string GetPlacementName() const override { return "ONEDNN"; } const std::string GetAttrName() const override { return "use_mkldnn"; } const std::unordered_set GetOpTypesList() const override { - return Get>("mkldnn_enabled_op_types"); + return Get>("onednn_enabled_op_types"); } }; diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc index c8346dcbafd7a0..81f4ca871d550a 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc @@ -133,7 +133,7 @@ class PlacementPassTest { auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set(onednn_enabled_op_types)); graph.reset(pass->Apply(graph.release())); @@ -143,8 +143,10 @@ class PlacementPassTest { for (auto* node : graph->Nodes()) { if (node->IsOp()) { auto* op = node->Op(); - if (op->HasAttr("use_mkldnn") && - PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) { + if ((op->HasAttr("use_mkldnn") && + PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) || + (op->HasAttr("use_onednn") && + PADDLE_GET_CONST(bool, op->GetAttr("use_onednn")))) { ++use_onednn_true_count; } } @@ -156,27 +158,27 @@ class PlacementPassTest { void PlacementNameTest() { auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); EXPECT_EQ(static_cast(pass.get())->GetPlacementName(), - "MKLDNN"); + "ONEDNN"); } }; -TEST(MKLDNNPlacementPass, enable_conv_relu) { +TEST(ONEDNNPlacementPass, enable_conv_relu) { // 2 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool PlacementPassTest().MainTest({"conv2d", "relu"}, 4); } -TEST(MKLDNNPlacementPass, enable_relu_pool) { +TEST(ONEDNNPlacementPass, enable_relu_pool) { // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool PlacementPassTest().MainTest({"relu", "pool2d"}, 4); } -TEST(MKLDNNPlacementPass, enable_all) { +TEST(ONEDNNPlacementPass, enable_all) { // 2 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + // 1 concat PlacementPassTest().MainTest({}, 6); } -TEST(MKLDNNPlacementPass, placement_name) { +TEST(ONEDNNPlacementPass, placement_name) { PlacementPassTest().PlacementNameTest(); } diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 0f3f0c2411f2c3..2863be568ae68c 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -119,7 +119,7 @@ IfInstruction::IfInstruction(size_t id, outputs.emplace(value, GetValueIds(value, *value_exec_info)); } if (value.use_count() > 0) { - VLOG(6) << "value " << i << " use conutn != 0"; + VLOG(6) << "value " << i << " use count != 0"; is_last_op = false; } } diff --git a/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc index 3105b6d09e3839..bc8fd95bf0da5c 100644 --- 
a/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc @@ -109,7 +109,7 @@ CudaGraphInstruction::CudaGraphInstruction( outputs.emplace(value, GetValueIds(value, *value_exec_info)); } if (value.use_count() > 0) { - VLOG(6) << "value " << i << " use conutn != 0"; + VLOG(6) << "value " << i << " use count != 0"; is_last_op = false; } } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 198ff8dcd8ccc3..95121b1d223312 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -170,7 +170,7 @@ inline void RegisterKernelClass(const char* op_type, std::string library(library_type); std::string data_layout = "ANYLAYOUT"; if (library == "MKLDNN") { - data_layout = "MKLDNNLAYOUT"; + data_layout = "ONEDNNLAYOUT"; } #ifdef PADDLE_WITH_CUSTOM_DEVICE if (std::is_same::value) { diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 5e11ce0e3f47cb..6b2b38feebef02 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -193,12 +193,12 @@ struct Argument { // whether to mute all logs in inference. DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool); - // Pass a set of op types to enable its mkldnn kernel - DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, - MKLDNNEnabledOpTypes, + // Pass a set of op types to enable its onednn kernel + DECL_ARGUMENT_FIELD(onednn_enabled_op_types, + ONEDNNEnabledOpTypes, std::unordered_set); - // The cache capacity of different input shapes for mkldnn. - DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int); + // The cache capacity of different input shapes for onednn. + DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, OnednnCacheCapacity, int); #ifdef PADDLE_WITH_DNNL // A set of op types to enable their quantized kernels @@ -219,7 +219,7 @@ struct Argument { Bfloat16EnabledOpTypes, std::unordered_set); - DECL_ARGUMENT_FIELD(use_onednn_int8, UseMkldnnInt8, bool); + DECL_ARGUMENT_FIELD(use_onednn_int8, UseOnednnInt8, bool); #endif // Passed from config. 
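The renames above (the placement pass, op_registry.h, and argument.h) move every oneDNN-related pass attribute from the legacy mkldnn_* keys to onednn_*. Below is a minimal C++ sketch of how a caller now drives the renamed pass, modeled on the onednn_placement_pass_tester.cc and interpreter_engine.cc call sites in this patch; the include paths and the ApplyOnednnPlacement wrapper are illustrative assumptions, not part of the change itself.

#include <memory>
#include <string>
#include <unordered_set>

#include "paddle/fluid/framework/ir/graph.h"  // assumed include paths
#include "paddle/fluid/framework/ir/pass.h"

// Hypothetical helper: fetch the registered pass and hand it the renamed
// attribute. An empty set asks the pass to place every supported op type.
void ApplyOnednnPlacement(std::unique_ptr<paddle::framework::ir::Graph>* graph,
                          const std::unordered_set<std::string>& op_types) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "onednn_placement_pass");
  // The pass now declares RequirePassAttr("onednn_enabled_op_types"), so
  // setting the old "mkldnn_enabled_op_types" key would fail its attribute
  // check at Apply time.
  pass->Set("onednn_enabled_op_types",
            new std::unordered_set<std::string>(op_types));  // Set takes ownership
  graph->reset(pass->Apply(graph->release()));
}

Call sites that previously set "mkldnn_enabled_op_types" (as build_strategy.cc and ir_pass_manager.cc did before this patch) only need the key swapped; the attribute's value type is unchanged.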
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 6048d8b4944477..c416926df5dfdd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -131,9 +131,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir))); pass_num++; } else if (pass_name == "onednn_placement_pass") { - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set( - argument->mkldnn_enabled_op_types())); + argument->onednn_enabled_op_types())); } else if (pass_name == "cudnn_placement_pass") { pass->Set("cudnn_enabled_op_types", new std::unordered_set()); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fcee93efdb61e9..4f1d59f4b64d94 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1031,8 +1031,8 @@ void AnalysisPredictor::OptimizeInferencePirProgram() { } #endif #ifdef PADDLE_WITH_DNNL - } else if (config_.mkldnn_enabled()) { - // mkldnn + } else if (config_.onednn_enabled()) { + // onednn pir::IrContext *ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); if (!config_.custom_pass_only_) { @@ -2100,9 +2100,9 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetIpuCustomPatterns(config_.ipu_custom_patterns_); #endif - if (config_.mkldnn_enabled() && !config_.use_gpu()) { - LOG(INFO) << "MKLDNN is enabled"; - argument_->SetMKLDNNEnabledOpTypes(config_.onednn_enabled_op_types_); + if (config_.onednn_enabled() && !config_.use_gpu()) { + LOG(INFO) << "ONEDNN is enabled"; + argument_->SetONEDNNEnabledOpTypes(config_.onednn_enabled_op_types_); } if (config_.cinn_enabled()) { @@ -2115,7 +2115,7 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetBfloat16EnabledOpTypes(config_.bfloat16_enabled_op_types_); } - if (config_.mkldnn_int8_enabled()) { + if (config_.onednn_int8_enabled()) { LOG(INFO) << "Int8 is enabled"; argument_->SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_); argument_->SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_); @@ -2296,7 +2296,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { #if defined(_WIN32) argument_->PartiallyRelease(); #else - if (config_.mkldnn_enabled() || + if (config_.onednn_enabled() || (config_.tensorrt_engine_enabled() && config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8)) { // NOLINT diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 96b0b49915d5d0..b19a33e5eadfd9 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -311,7 +311,7 @@ bool PD_OnednnEnabled(const PD_AnalysisConfig* config) { config, common::errors::InvalidArgument( "The pointer of analysis configuration shouldn't be nullptr")); - return config->config.mkldnn_enabled(); + return config->config.onednn_enabled(); } void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index 0bba3ebd2e554b..d1f341b504c965 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -53,7 +53,7 @@ void InterpreterEngine::CreateInterpreterCore() { #ifdef PADDLE_WITH_DNNL auto onednn_pass = 
framework::ir::PassRegistry::Instance().Get("onednn_placement_pass"); - onednn_pass->Set("mkldnn_enabled_op_types", + onednn_pass->Set("onednn_enabled_op_types", new std::unordered_set({})); onednn_pass->Apply(&graph); #endif diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index 944d9f6bfca1e2..4089772637abf0 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -61,7 +61,7 @@ static bool ReduceOpHasOptimizedOneDNNKernel( } // only poolop -bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { +bool CanONEDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // oneDNN is supporting only unchangeable in size pool window auto src_tz = common::vectorize(ctx.Input("X")->dims()); @@ -181,7 +181,7 @@ phi::KernelKey GetPoolExpectedKernelType( auto data_type = op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"); // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - op_ptr->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); + op_ptr->SetDnnFallback(!CanONEDNNSupportPool(ctx)); // NOTE(jiahongyu) END: Above codes originally enclosed by PADDLE_WITH_DNNL return phi::KernelKey(data_type, ctx.GetPlace()); @@ -194,7 +194,7 @@ phi::KernelKey GetPoolDoubleGradExpectedKernelType( auto data_type = op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "grad_x@GRAD"); // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - op_ptr->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); + op_ptr->SetDnnFallback(!CanONEDNNSupportPool(ctx)); // NOTE(jiahongyu) END: Above codes originally enclosed by PADDLE_WITH_DNNL return phi::KernelKey(data_type, ctx.GetPlace()); diff --git a/paddle/fluid/pybind/compiled_program.cc b/paddle/fluid/pybind/compiled_program.cc index 563ff805815fc7..18f36a2b2efe33 100644 --- a/paddle/fluid/pybind/compiled_program.cc +++ b/paddle/fluid/pybind/compiled_program.cc @@ -824,6 +824,15 @@ void BindCompiledProgram(pybind11::module &m) { // NOLINT const std::unordered_set &onednn_enabled_op_types) { self.onednn_enabled_op_types_ = onednn_enabled_op_types; }) + .def_property( + "onednn_enabled_op_types", + [](const BuildStrategy &self) { + return self.onednn_enabled_op_types_; + }, + [](BuildStrategy &self, + const std::unordered_set &onednn_enabled_op_types) { + self.onednn_enabled_op_types_ = onednn_enabled_op_types; + }) .def_property( "allow_cuda_graph_capture", [](const BuildStrategy &self) { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f4a373824b162b..8af90c243833d3 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1414,10 +1414,10 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, if (self->tensor.is_dense_tensor()) { auto* dst_tensor = static_cast(self->tensor.impl().get()); - if (self->tensor.has_allocation() && - !dst_tensor->meta().is_contiguous() || - !src_tensor->meta().is_contiguous()) { - VLOG(8) << "set_tensor() method , src or dst tensor is not contiguous"; + if (self->tensor.has_allocation() && self->tensor.initialized() && + (!dst_tensor->meta().is_contiguous() || + !src_tensor->meta().is_contiguous())) { + VLOG(8) << "set_tensor() method, src or dst tensor is not contiguous"; if (!FLAGS_use_stride_kernel) { PADDLE_THROW(common::errors::Fatal( "FLAGS_use_stride_kernel is closed. 
Strided kernel " @@ -1450,7 +1450,6 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, "The `set_tensor()` method of non DenseTensor get a DenseTensor src " "value")); } - } else if (value.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE auto* src_tensor = @@ -1484,7 +1483,6 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, "current PaddlePaddle, please recompile and installPaddlePaddle " "with the option of `WITH_DISTRIBUTE=ON`.")); #endif - } else { PADDLE_THROW(common::errors::Unavailable( "The `set_tensor()` method of (Dist)Tensor get a non " diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b48a5ba9f630b8..f090156d54d0c6 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -1050,26 +1050,35 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::SwitchIrDebug, py::arg("x") = true, py::arg("passes") = std::vector()) - .def("enable_mkldnn", &AnalysisConfig::EnableONEDNN) - .def("disable_mkldnn", &AnalysisConfig::DisableONEDNN) - .def("mkldnn_enabled", &AnalysisConfig::onednn_enabled) + .def("enable_mkldnn", &AnalysisConfig::EnableONEDNN) // deprecated + .def("disable_mkldnn", &AnalysisConfig::DisableONEDNN) // deprecated + .def("mkldnn_enabled", &AnalysisConfig::onednn_enabled) // deprecated + .def("enable_onednn", &AnalysisConfig::EnableONEDNN) + .def("disable_onednn", &AnalysisConfig::DisableONEDNN) + .def("onednn_enabled", &AnalysisConfig::onednn_enabled) .def("enable_cinn", &AnalysisConfig::EnableCINN) .def("set_cpu_math_library_num_threads", &AnalysisConfig::SetCpuMathLibraryNumThreads) .def("cpu_math_library_num_threads", &AnalysisConfig::cpu_math_library_num_threads) .def("to_native_config", &AnalysisConfig::ToNativeConfig) - .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableOnednnBfloat16) + .def("enable_mkldnn_bfloat16", + &AnalysisConfig::EnableOnednnBfloat16) // deprecated + .def("enable_onednn_bfloat16", &AnalysisConfig::EnableOnednnBfloat16) #ifdef PADDLE_WITH_DNNL .def("set_mkldnn_cache_capacity", + &AnalysisConfig::SetOnednnCacheCapacity, + py::arg("capacity") = 0) // deprecated + .def("set_onednn_cache_capacity", &AnalysisConfig::SetOnednnCacheCapacity, py::arg("capacity") = 0) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) .def("enable_mkldnn_int8", &AnalysisConfig::EnableOnednnInt8, py::arg("mkldnn_int8_enabled_op_types") = - std::unordered_set({})) - .def("mkldnn_int8_enabled", &AnalysisConfig::onednn_int8_enabled) + std::unordered_set({})) // deprecated + .def("mkldnn_int8_enabled", + &AnalysisConfig::onednn_int8_enabled) // deprecated .def("disable_mkldnn_fc_passes", &AnalysisConfig::DisableOnednnFcPasses, R"DOC( @@ -1085,9 +1094,31 @@ void BindAnalysisConfig(py::module *m) { >>> config = Config("") >>> config.enable_mkldnn() >>> config.disable_mkldnn_fc_passes() + )DOC") // deprecated + .def("enable_onednn_int8", + &AnalysisConfig::EnableOnednnInt8, + py::arg("onednn_int8_enabled_op_types") = + std::unordered_set({})) + .def("onednn_int8_enabled", &AnalysisConfig::onednn_int8_enabled) + .def("disable_onednn_fc_passes", + &AnalysisConfig::DisableOnednnFcPasses, + R"DOC( + Disable Onednn FC + Returns: + None. + + Examples: + .. 
code-block:: python + + >>> from paddle.inference import Config + + >>> config = Config("") + >>> config.enable_onednn() + >>> config.disable_onednn_fc_passes() )DOC") #endif - .def("set_mkldnn_op", &AnalysisConfig::SetONEDNNOp) + .def("set_mkldnn_op", &AnalysisConfig::SetONEDNNOp) // deprecated + .def("set_onednn_op", &AnalysisConfig::SetONEDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("model_from_memory", &AnalysisConfig::model_from_memory) .def("delete_pass", &AnalysisConfig::DeletePass) @@ -1329,23 +1360,32 @@ void BindPaddlePassBuilder(py::module *m) { py::class_(*m, "PassStrategy") .def(py::init &>()) .def("enable_cudnn", &PassStrategy::EnableCUDNN) - .def("enable_mkldnn", &PassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16) + .def("enable_mkldnn", &PassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &PassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &PassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &PassStrategy::EnableOnednnBfloat16) .def("use_gpu", &PassStrategy::use_gpu); py::class_(*m, "CpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN) - .def("enable_mkldnn", &CpuPassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16); + .def("enable_mkldnn", &CpuPassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &CpuPassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &CpuPassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &CpuPassStrategy::EnableOnednnBfloat16); py::class_(*m, "GpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN) - .def("enable_mkldnn", &GpuPassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16); + .def("enable_mkldnn", &GpuPassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &GpuPassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &GpuPassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &GpuPassStrategy::EnableOnednnBfloat16); } void BindInternalUtils(py::module *m) { diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index e56e494160fe88..4be2fe7a31976d 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -525,7 +525,7 @@ static void ParseIndex(const paddle::Tensor& tensor, if (slice_tensor.dtype() == phi::DataType::BOOL) { // bool tensor consumes (rank of index tensor) dimensions of input // tensor - for (int i = 0; i < slice_tensor.shape().size(); i++) { + for (size_t i = 0; i < slice_tensor.shape().size(); i++) { PADDLE_ENFORCE_EQ(slice_tensor.shape()[i], dim_len, common::errors::OutOfRange( @@ -684,7 +684,7 @@ static paddle::Tensor dealWithAdvancedIndex( if (index.dtype() == phi::DataType::BOOL) { *rank_of_new_dim = std::max(*rank_of_new_dim, 1); i--; - for (int j = 0; j < index.shape().size(); j++) { + for (size_t j = 0; j < index.shape().size(); j++) { i++; index_dim = (*advanced_index_dim)[i]; trans_dim->push_back(index_dim); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 52462bd182803f..b2c83177284486 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -903,7 +903,7 @@ void BindTensor(pybind11::module &m) { // NOLINT const auto &device_id = paddle::platform::GetXPUCurrentDeviceId(); auto stream = 
paddle::platform::get_current_stream(device_id); - xpu_wait(stream); + xpu_wait(stream->raw_stream()); int type_idx = static_cast(self.type()); size_t data_size = self.numel() * framework::SizeOfType( diff --git a/paddle/fluid/pybind/xpu_streams_py.cc b/paddle/fluid/pybind/xpu_streams_py.cc index 98a581e0768138..957746605007ab 100644 --- a/paddle/fluid/pybind/xpu_streams_py.cc +++ b/paddle/fluid/pybind/xpu_streams_py.cc @@ -33,19 +33,27 @@ namespace py = pybind11; namespace paddle { namespace platform { #ifdef PADDLE_WITH_XPU -XPUStream get_current_stream(int device_id) { - if (device_id == -1) { - device_id = phi::backends::xpu::GetXPUCurrentDeviceId(); - } +phi::XPUStreamHandle *get_current_stream(int device_id) { auto place = phi::XPUPlace(device_id); auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); - return dev_ctx->stream(); + return dev_ctx->get_current_stream_handle(); +} + +phi::XPUStreamHandle *set_current_stream(int idx) { + int device_id = phi::backends::xpu::GetXPUCurrentDeviceId(); + auto original_stream = get_current_stream(device_id); + auto place = phi::XPUPlace(device_id); + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + dev_ctx->SetCurrentStream(idx); + return original_stream; } #endif } // namespace platform + namespace pybind { void BindXpuStream(py::module *m_ptr) { auto &m = *m_ptr; @@ -69,7 +77,7 @@ #endif }); m.def( - "_get_current_stream", + "_xpu_get_current_stream", [](int device_id) { #ifdef PADDLE_WITH_XPU if (device_id == -1) { @@ -79,7 +87,19 @@ return platform::get_current_stream(device_id); #else PADDLE_THROW( - common::errors::Unavailable("Paddle is not compiled with CUDA. " + common::errors::Unavailable("Paddle is not compiled with XPU. " "Cannot visit device synchronize.")); #endif }, py::return_value_policy::reference); + m.def( + "_xpu_set_current_stream", + [](int stream_id) { +#ifdef PADDLE_WITH_XPU + return platform::set_current_stream(stream_id); +#else + PADDLE_THROW( + common::errors::Unavailable("Paddle is not compiled with XPU. " "Cannot visit device synchronize.")); #endif }, py::return_value_policy::reference); @@ -100,12 +120,167 @@ #endif }); + py::class_(m, "XPUStream", R"DOC( + The handle of the XPU stream. + + Parameters: + device(paddle.XPUPlace()|int|None, optional): The device on which to allocate the stream. + If device is None or a negative integer, the current device is used. + If device is a positive integer, it must be less than the device count. Default: None. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0)) + >>> s2 = paddle.device.xpu.Stream(0) + >>> s3 = paddle.device.xpu.Stream() + + )DOC") +#ifdef PADDLE_WITH_XPU + .def_property_readonly( + "xpu_stream", + [](phi::XPUStreamHandle &self) { + return reinterpret_cast(self.raw_stream()); + }) + .def("wait_stream", + [](phi::XPUStreamHandle &self, phi::XPUStreamHandle &other) { + auto *dev_ctx = phi::get_xpu_context(); + dev_ctx->StreamWaitStreamInPool(self.id(), other.id()); + }) + .def("wait_event", + [](phi::XPUStreamHandle &self, phi::XPUEventHandle &other) { + self.wait_event(other.get_event()); + }) + .def("query", + [](phi::XPUStreamHandle &self) { + PADDLE_THROW(common::errors::Unavailable( + "Query function for XPUStream is not supported now")); + }) + .def("record_event", + [](phi::XPUStreamHandle &self, phi::XPUEventHandle *event) { + if (event == nullptr) { + event = new phi::XPUEventHandle(); + } + self.record_event(event->get_event()); + return event; + }) + .def( + "synchronize", + [](phi::XPUStreamHandle &self) { self.synchronize(); }, + R"DOC( + Waits for stream tasks to complete. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> s = paddle.device.xpu.Stream(paddle.XPUPlace(0)) + >>> s.synchronize() + + )DOC") + .def_property_readonly( + "place", + [](phi::XPUStreamHandle &self) { + return phi::XPUPlace(platform::GetXPUCurrentDeviceId()); + }) + .def_property_readonly( + "idx", [](phi::XPUStreamHandle &self) { return self.id(); }) +#endif + + .def("__init__", + [](phi::XPUStreamHandle &self) { +#ifdef PADDLE_WITH_XPU + new (&self) phi::XPUStreamHandle(); + self.Init(); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }) + .def( + "__init__", + [](phi::XPUStreamHandle &self, phi::XPUPlace *place) { +#ifdef PADDLE_WITH_XPU + if (place == nullptr) { + int curr_device_id = platform::GetXPUCurrentDeviceId(); + auto place_tmp = phi::XPUPlace(curr_device_id); + new (&self) phi::XPUStreamHandle(place_tmp); + } else { + new (&self) phi::XPUStreamHandle(*place); + } +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }, + py::arg("device") = nullptr) + .def( + "__init__", + [](phi::XPUStreamHandle &self, int device) { +#ifdef PADDLE_WITH_XPU + if (device < 0) { + device = platform::GetXPUCurrentDeviceId(); + } + auto place_tmp = phi::XPUPlace(device); + new (&self) phi::XPUStreamHandle(place_tmp); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }, + py::arg("device") = -1); py::class_(m, "XPUEvent", R"DOC( + The handle of the XPU event. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> event = paddle.device.xpu.Event() + + )DOC") +#ifdef PADDLE_WITH_XPU + .def( + "record", + [](phi::XPUEventHandle &self, phi::XPUStreamHandle *stream) { + if (stream == nullptr) { + auto *dev_ctx = phi::get_xpu_context(); + auto stream_handle = dev_ctx->get_current_stream_handle(); + self.record(stream_handle->raw_stream()); + } else { + self.record(stream->raw_stream()); + } + }, + py::arg("stream") = nullptr) + .def("query", [](phi::XPUEventHandle &self) { return self.query(); }) + .def("elapsed_time", + [](phi::XPUEventHandle &self) { + PADDLE_THROW(common::errors::Unavailable( + "XPUEvent elapsed_time is not supported now")); + }) + .def("synchronize", [](phi::XPUEventHandle &self) { self.synchronize(); }) +#endif + .def("__init__", [](phi::XPUEventHandle &self) { +#ifdef PADDLE_WITH_XPU + new (&self) phi::XPUEventHandle(); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUEvent can only be initialized on the XPU platform.")); +#endif + }); #ifdef PADDLE_WITH_XPU - py::class_(m, "XPUStream", R"DOC( - The handle of the CUDA stream. + py::class_(m, "XPUCUDAStream", R"DOC( + The handle of the XPU stream. Parameters: - device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + device(paddle.XPUPlace()|int|None, optional): The device which wanted to allocate the stream. If device is None or negative integer, device will be the current device. If device is positive integer, it must less than the device count. Default: None. priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). @@ -114,16 +289,16 @@ void BindXpuStream(py::module *m_ptr) { Examples: .. code-block:: python - >>> # doctest: +REQUIRES(env:GPU) + >>> # doctest: +REQUIRES(env:XPU) >>> import paddle - >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - >>> s2 = paddle.device.cuda.Stream(0, 1) - >>> s3 = paddle.device.cuda.Stream() + >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1) + >>> s2 = paddle.device.xpu.Stream(0, 1) + >>> s3 = paddle.device.xpu.Stream() )DOC") .def( "synchronize", - [](XPUStream &self) { xpu_wait(self); }, + [](phi::XPUCUDAStream &self) { self.Synchronize(); }, R"DOC( Waits for stream tasks to complete. 
@@ -135,7 +310,25 @@ void BindXpuStream(py::module *m_ptr) { >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) >>> s.synchronize() - )DOC"); + )DOC") + .def("__init__", + [](phi::XPUCUDAStream &self, phi::XPUPlace *place, int priority) { + if (priority != 1 && priority != 2) { + PADDLE_THROW(common::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto stream_flag = + phi::XPUCUDAStream::StreamFlag::kStreamNonBlocking; + if (place == nullptr) { + int curr_device_id = platform::GetXPUCurrentDeviceId(); + auto place_tmp = phi::XPUPlace(curr_device_id); + new (&self) + phi::XPUCUDAStream(place_tmp, priority - 2, stream_flag); + } else { + new (&self) + phi::XPUCUDAStream(*place, priority - 2, stream_flag); + } + }); #endif } } // namespace pybind diff --git a/paddle/fluid/pybind/xpu_streams_py.h b/paddle/fluid/pybind/xpu_streams_py.h index a146cf6ba3419e..a1f56b879d1cd9 100644 --- a/paddle/fluid/pybind/xpu_streams_py.h +++ b/paddle/fluid/pybind/xpu_streams_py.h @@ -18,12 +18,16 @@ #include "pybind11/stl.h" #ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/core/xpu_cuda_stream.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" + #else namespace phi { class XPUCUDAStream {}; +class XPUStreamHandle {}; +class XPUEventHandle {}; } // namespace phi #endif @@ -32,7 +36,8 @@ namespace py = pybind11; namespace paddle { namespace platform { #ifdef PADDLE_WITH_XPU -XPUStream get_current_stream(int device_id = -1); +phi::XPUStreamHandle* get_current_stream(int device_id = -1); +phi::XPUStreamHandle* set_current_stream(int idx); #endif } // namespace platform namespace pybind { diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b9e919c52b11b2..93bed19b2bc29d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_XPU +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#endif + #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/stream.h" #endif @@ -434,6 +439,10 @@ class PADDLE_API Tensor final { * @return gpuStream_t */ gpuStream_t stream() const; +#elif defined(PADDLE_WITH_XPU) + + void record_stream(XPUStream stream) const; + #elif defined(PADDLE_WITH_CUSTOM_DEVICE) /** * @brief Get the stream where the tensor is currently located diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 98c632a511cd74..0e6af802094e2d 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -40,6 +40,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/memory/malloc.h" + namespace paddle { using DeviceContextPool = experimental::DeviceContextPool; @@ -397,6 +399,14 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { const std::shared_ptr &Tensor::impl() const { return impl_; } +#ifdef PADDLE_WITH_XPU + +void Tensor::record_stream(XPUStream stream) const { + paddle::memory::RecordStream( + std::dynamic_pointer_cast(impl_)->Holder(), stream); +} + +#endif void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 149cbc3b56beb3..2f7d54eaa05e00 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -198,7 +198,8 @@ void Tensor::copy_(const Tensor &src, return; } #endif - if(is_dense_tensor() && has_allocation() && src.is_dense_tensor()) { + if(is_dense_tensor() && has_allocation() && + initialized() && src.is_dense_tensor()) { auto dst_tensor = static_cast(impl_.get()); auto src_tensor = std::static_pointer_cast(src.impl_); if(!dst_tensor->meta().is_contiguous() || diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 800aefdc91ffa4..1c5d26d5f548c4 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/backends/context_pool.h" #ifdef PADDLE_WITH_XPU #include @@ -100,8 +101,8 @@ struct XPUContext::Impl { } // Set external stream for context - void SetStream(void* stream) { - if (context_->xpu_stream != nullptr && stream_owned_) { + void SetStream(void* stream, bool clear = true) { + if (clear && context_->xpu_stream != nullptr && stream_owned_) { xpu_stream_destroy(context_->xpu_stream); } stream_owned_ = false; @@ -343,7 +344,21 @@ XPUContext::XPUContext() : DeviceContext() { } else { impls_.push_back(std::make_unique()); impls_[0]->Init(get_gm_size(0), get_l3_size(0)); + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); + current_stream_handle = + XPUStreamHandle(impls_[0]->context_->get_stream(), 0); + if (std::getenv("XPU_DEFAULT_STREAM_NUMBER") != nullptr) { + int default_num_stream = atoi(std::getenv("XPU_DEFAULT_STREAM_NUMBER")); + for (int i = 0; i < default_num_stream; i++) { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + stream_pool.push_back(s); + idle_stream_flags.push_back(true); + } + } } + current_stream_idx = 0; } XPUContext::XPUContext(const XPUPlace& place, bool is_comm_context) @@ -362,10 +377,18 @@ XPUContext::XPUContext(const XPUPlace& place, bool is_comm_context) impls_.push_back(std::make_unique(place)); impls_[i]->Init(get_gm_size(i), get_l3_size(i)); } + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); } else { impls_.push_back(std::make_unique(place)); impls_[0]->Init(get_gm_size(0), get_l3_size(0)); + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); + current_stream_handle = + XPUStreamHandle(impls_[0]->context_->get_stream(), 0); } + + current_stream_idx = 0; } XPUContext::~XPUContext() = default; @@ -380,6 +403,9 @@ XPUStream XPUContext::stream(int i) const { void XPUContext::SetStream(void* stream, int i) { CheckValidStreamId(i); impls_[i]->SetStream(stream); + if (i == 0) { + 
current_stream_handle.set_stream(static_cast(stream)); + } } void XPUContext::CheckValidStreamId(int i) const { @@ -397,6 +423,21 @@ void XPUContext::CheckValidStreamId(int i) const { i)); } +void XPUContext::CheckValidIdxInRange(int i, int i_max) const { + PADDLE_ENFORCE_GE( + i, + 0, + errors::InvalidArgument( + "The stream index must be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + i, + i_max, + errors::InvalidArgument("The stream index should be less than the number " + "of stream used (%d), but got %d", + i_max, + i)); +} + void XPUContext::SetXpuVersion(int version) { impls_[0]->xpu_version_ = static_cast(version); } @@ -462,26 +503,251 @@ void XPUContext::StreamWaitEvent(XPUEvent event, int s) const { void XPUContext::StreamWaitStream(int wait_stream, int record_stream) const { CheckValidStreamId(wait_stream); CheckValidStreamId(record_stream); - XPUEvent event; - int r = xpu_event_create(&event); - PADDLE_ENFORCE_XRE_SUCCESS(r); + XPUEvent event = XPUEventPool::Instance().CreateEventFromPool(); RecordEvent(event, record_stream); StreamWaitEvent(event, wait_stream); - r = xpu_event_destroy(event); - PADDLE_ENFORCE_XRE_SUCCESS(r); - impls_[record_stream]->ClearStashedMemory(); } int64_t XPUContext::GetStreamNum() const { return impls_.size(); } +int XPUContext::SetCurrentStream(int idx) { + int prev_stream_idx = current_stream_idx; + if (prev_stream_idx != idx) { + impls_[0]->SetStream(stream_pool[idx]); + current_stream_handle.set_stream(stream_pool[idx]); + current_stream_idx = idx; + idle_stream_flags[prev_stream_idx] = true; + idle_stream_flags[current_stream_idx] = false; + } + return prev_stream_idx; +} + +void XPUContext::StreamWaitStreamInPool(int wait_stream, + int record_stream) const { + CheckValidIdxInRange(wait_stream, stream_pool.size()); + CheckValidIdxInRange(record_stream, stream_pool.size()); + XPUEvent event = XPUEventPool::Instance().CreateEventFromPool(); + int r = xpu_event_record(event, stream_pool[record_stream]); + PADDLE_ENFORCE_XRE_SUCCESS(r); + r = xpu_stream_wait_event(stream_pool[wait_stream], event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUContext::StreamWaitEventInPool(int wait_stream, XPUEvent event) const { + CheckValidIdxInRange(wait_stream, stream_pool.size()); + int r = xpu_stream_wait_event(stream_pool[wait_stream], event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +int XPUContext::get_idle_stream() { + bool found_idle_stream = false; + int stream_idx = 0; + int num_streams = idle_stream_flags.size(); + for (; stream_idx < num_streams; stream_idx++) { + if (idle_stream_flags[stream_idx]) { + found_idle_stream = true; + break; + } + } + if (found_idle_stream) { + idle_stream_flags[stream_idx] = false; + return stream_idx; + } else { + add_stream_to_pool(); + return stream_pool.size() - 1; + } +} + +void XPUContext::add_stream_to_pool() { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + stream_pool.push_back(s); + idle_stream_flags.push_back(false); +} + +XPUStream XPUContext::get_stream_from_pool(int idx) const { + PADDLE_ENFORCE_GE( + idx, + 0, + errors::InvalidArgument( + "The stream index must be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + idx, + stream_pool.size(), + errors::InvalidArgument("The stream index should be less than the number " + "of stream used (%d), but got %d", + stream_pool.size(), + idx)); + return stream_pool[idx]; +} + +int XPUContext::get_current_stream_idx() { return current_stream_idx; } void XPUContext::AddStashedMemory(int stream, const DenseTensor& tensor) { 
CheckValidStreamId(stream); impls_[stream]->AddStashedMemory(tensor); } +XPUStream XPUContext::get_current_stream() { return impls_[0]->stream(); } + +XPUStreamHandle* XPUContext::get_current_stream_handle() { + if (impls_[0]->context_->get_stream() == nullptr) { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + impls_[0]->SetStream(s); + stream_pool[current_stream_idx] = s; + current_stream_handle.set_stream(s); + } + return ¤t_stream_handle; +} + void XPUContext::Init() { impls_[0]->Init(); } +XPUContext* get_xpu_context(int device_id) { + auto place_tmp = phi::XPUPlace( + device_id > -1 ? device_id : phi::backends::xpu::GetXPUCurrentDeviceId()); + phi::XPUContext* dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place_tmp)); + + return dev_ctx; +} + +XPUStreamHandle::XPUStreamHandle() {} + +XPUStreamHandle::XPUStreamHandle(const int idx) { + auto* dev_ctx = phi::get_xpu_context(); + stream_id = idx; + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +XPUStreamHandle::XPUStreamHandle(const phi::XPUPlace& place) { + phi::XPUContext* dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + stream_id = dev_ctx->get_idle_stream(); + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +XPUStreamHandle::XPUStreamHandle(const XPUStream xpu_stream, const int id) { + stream = xpu_stream; + stream_id = id; +} + +void XPUStreamHandle::Init() { + auto* dev_ctx = phi::get_xpu_context(); + stream_id = dev_ctx->get_idle_stream(); + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +void XPUStreamHandle::wait_event(XPUEvent event) const { + int r = xpu_stream_wait_event(stream, event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUStreamHandle::synchronize() const { + int r = xpu_wait(stream); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUStreamHandle::set_stream(XPUStream stream_) { stream = stream_; } + +void XPUStreamHandle::record_event(XPUEvent event) const { + int r = xpu_event_record(event, stream); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +XPUStreamHandle get_current_stream_handle(int device_id) { + auto* dev_ctx = get_xpu_context(device_id); + return *dev_ctx->get_current_stream_handle(); +} + +XPUStreamHandle get_stream_handle(int device_id) { + auto* dev_ctx = get_xpu_context(device_id); + return XPUStreamHandle(dev_ctx->get_idle_stream()); +} + +void set_current_stream(XPUStreamHandle* s) { + auto* dev_ctx = get_xpu_context(); + dev_ctx->SetStream(s->raw_stream(), 0); +} + +XPUEventPool& XPUEventPool::Instance() { + static XPUEventPool pool; + return pool; +} + +XPUEventPool::~XPUEventPool() { + const auto& DestroyEvent = [](XPUEvent event) { + int r = xpu_event_destroy(event); + PADDLE_ENFORCE_XRE_SUCCESS(r); + }; + const auto& CheckComplishAndDestroy = [&](XPUEvent event) -> bool { + if (xpu_event_query(event) == XPU_SUCCESS) { + DestroyEvent(event); + return true; + } else { + return false; + } + }; + std::unique_lock lock(mtx_); + while (!incomplished_events_.empty()) { + XPUEvent event = incomplished_events_.front(); + if (!CheckComplishAndDestroy(event)) { + LOG(ERROR) << "failed on destroying event when destroying event pool."; + } + incomplished_events_.pop(); + } +} + +XPUEvent XPUEventPool::CreateEventFromPool() { + std::unique_lock lock(mtx_); + + const auto& CreateNewEvent = [&]() -> XPUEvent { + XPUEvent new_event; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_create(&new_event)); + incomplished_events_.push(new_event); + return new_event; + }; + + const auto& CreateNewOrReuseEvent = [&]() -> XPUEvent { + XPUEvent 
front_event = incomplished_events_.front(); + incomplished_events_.pop(); + incomplished_events_.push(front_event); + if (xpu_event_query(front_event) == XPU_SUCCESS) { + return front_event; + } + return CreateNewEvent(); + }; + + if (incomplished_events_.empty()) { + return CreateNewEvent(); + } + return CreateNewOrReuseEvent(); +} + +XPUEventHandle::XPUEventHandle() { + event_ = XPUEventPool::Instance().CreateEventFromPool(); +} +XPUEventHandle::XPUEventHandle(XPUStream stream) { + event_ = XPUEventPool::Instance().CreateEventFromPool(); + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_record(event_, stream)); +} + +void XPUEventHandle::record(XPUStream stream) { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_query(event_)); + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_record(event_, stream)); +} + +bool XPUEventHandle::query() { + int result = xpu_event_query(event_); + if (result == XPU_SUCCESS) { + return true; + } + return false; +} + +void XPUEventHandle::synchronize() { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_wait(event_)); +} #if defined(PADDLE_WITH_XPU) XPUPinnedContext::XPUPinnedContext() { eigen_device_ = std::make_unique(); diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 2a9823d6a8de88..daa4cdd05e3d69 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -17,14 +17,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include +#include #include - #include "paddle/phi/backends/xpu/forwards.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" - #ifdef PADDLE_WITH_XPU #include "paddle/phi/core/xpu_cuda_stream.h" #endif @@ -39,6 +39,26 @@ namespace phi { #ifdef PADDLE_WITH_XPU class XPUCUDAStream; +class XPUStreamHandle { + public: + XPUStreamHandle(); + explicit XPUStreamHandle(const int idx); + explicit XPUStreamHandle(const XPUPlace& place); + explicit XPUStreamHandle(const XPUStream xpu_stream, const int id); + + void Init(); + + int id() const { return stream_id; } + XPUStream raw_stream() const { return stream; } + void wait_event(XPUEvent event) const; + void synchronize() const; + void record_event(XPUEvent event) const; + void set_stream(XPUStream stream); + + private: + XPUStream stream; + int stream_id; +}; #endif class DenseTensor; @@ -110,16 +130,65 @@ class XPUContext : public DeviceContext, Eigen::DefaultDevice* eigen_device() const { return nullptr; } XPUStream stream(int i = 0) const; - + XPUStream get_stream_from_pool(int i = 0) const; + XPUStream get_current_stream(); static const char* name() { return "XPUContext"; } + int SetCurrentStream(int idx); + void StreamWaitStreamInPool(int wait_stream, int record_stream) const; + void StreamWaitEventInPool(int wait_stream, XPUEvent event) const; + int get_idle_stream(); + int get_current_stream_idx(); + XPUStreamHandle* get_current_stream_handle(); private: struct Impl; + XPUStreamHandle current_stream_handle; std::vector> impls_; + std::vector idle_stream_flags; + std::vector stream_pool; + int current_stream_idx; + void add_stream_to_pool(); + int get_stream_pool_size() const { return stream_pool.size(); } void CheckValidStreamId(int i) const; + void CheckValidIdxInRange(int idx, int range) const; +}; + +XPUContext* get_xpu_context(int device_id = -1); + +class XPUEventPool { + public: + XPUEventPool() = default; + XPUEventPool(const XPUEventPool&) = delete; + XPUEventPool(XPUEventPool&&) = delete; + 
~XPUEventPool(); + + XPUEvent CreateEventFromPool(); + + static XPUEventPool& Instance(); + + private: + std::queue incomplished_events_; + std::mutex mtx_; }; +class XPUEventHandle { + public: + XPUEventHandle(); + explicit XPUEventHandle(XPUStream stream); + void record(XPUStream stream); + bool query(); + void synchronize(); + XPUEvent get_event() const { return event_; } + + private: + XPUEvent event_; +}; + +XPUStreamHandle get_current_stream_handle(int device_id = -1); +XPUStreamHandle get_stream_handle(int device_id = -1); +void set_current_stream(XPUStreamHandle* s); + // KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering diff --git a/paddle/phi/core/memory/allocation/allocator_facade.cc b/paddle/phi/core/memory/allocation/allocator_facade.cc index 65daaa257c2c5f..5123c6b33b6685 100644 --- a/paddle/phi/core/memory/allocation/allocator_facade.cc +++ b/paddle/phi/core/memory/allocation/allocator_facade.cc @@ -1940,6 +1940,14 @@ void AllocatorFacade::SetDefaultStream(const phi::XPUPlace& place, } #endif +#ifdef PADDLE_WITH_XPU + +bool AllocatorFacade::RecordStream(std::shared_ptr allocation, + XPUStream stream) { + return GetPrivate()->RecordStream(allocation, stream); +} +#endif + #ifdef PADDLE_WITH_CUSTOM_DEVICE uint64_t AllocatorFacade::Release(const phi::CustomPlace& place, phi::stream::stream_t stream) { diff --git a/paddle/phi/core/memory/allocation/allocator_facade.h b/paddle/phi/core/memory/allocation/allocator_facade.h index e46a6f9b13ef52..4b24dfcf57af4a 100644 --- a/paddle/phi/core/memory/allocation/allocator_facade.h +++ b/paddle/phi/core/memory/allocation/allocator_facade.h @@ -97,6 +97,7 @@ class AllocatorFacade { #elif defined(PADDLE_WITH_XPU) TEST_API const std::shared_ptr& GetAllocator( const phi::Place& place, XPUStream stream); + bool RecordStream(std::shared_ptr allocation, XPUStream stream); void SetDefaultStream(const phi::XPUPlace& place, XPUStream stream); #endif diff --git a/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc b/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc index fd30d61d47593e..8cd5471eb0e014 100644 --- a/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc +++ b/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc @@ -38,6 +38,10 @@ StreamSafeXPUAllocation::StreamSafeXPUAllocation( bool StreamSafeXPUAllocation::RecordStream(XPUStream stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { + VLOG(8) << "stream " << stream << " is the same as owning stream " + << owning_stream_; + VLOG(8) << "Skip recording the same stream " << stream << " for address " + << ptr(); return false; } @@ -57,9 +61,13 @@ bool StreamSafeXPUAllocation::CanBeFreed() { it != outstanding_event_map_.end(); ++it) { XPUEvent& event = it->second; - - PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_destroy(event)); - VLOG(8) << "Destroy event " << event; + if (xpu_event_query(event) == XPU_SUCCESS) { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_destroy(event)); + VLOG(8) << "Destroy event " << event; + } else { + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } } return true; } diff --git a/paddle/phi/core/memory/malloc.cc b/paddle/phi/core/memory/malloc.cc index 050a3d2855189b..304a835a5b1b71 100644 --- a/paddle/phi/core/memory/malloc.cc +++ b/paddle/phi/core/memory/malloc.cc @@ -76,6 +76,13 @@ 
diff --git a/paddle/phi/core/memory/malloc.cc b/paddle/phi/core/memory/malloc.cc
index 050a3d2855189b..304a835a5b1b71 100644
--- a/paddle/phi/core/memory/malloc.cc
+++ b/paddle/phi/core/memory/malloc.cc
@@ -76,6 +76,13 @@ gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
 #endif

+#ifdef PADDLE_WITH_XPU
+bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream) {
+  return allocation::AllocatorFacade::Instance().RecordStream(allocation,
+                                                              stream);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 uint64_t Release(const phi::CustomPlace& place, phi::stream::stream_t stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
diff --git a/paddle/phi/core/memory/malloc.h b/paddle/phi/core/memory/malloc.h
index eea770696608a2..0d064e28b8a119 100644
--- a/paddle/phi/core/memory/malloc.h
+++ b/paddle/phi/core/memory/malloc.h
@@ -22,6 +22,11 @@ limitations under the License. */
 #include "paddle/phi/core/memory/allocation/allocator.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_XPU
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#endif
+
 namespace paddle {
 namespace memory {

@@ -58,6 +63,11 @@ void EraseStream(std::shared_ptr<Allocation> allocation, gpuStream_t stream);

 gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream);
+#endif
+
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 extern uint64_t Release(const phi::CustomPlace& place,
                         phi::stream::stream_t stream);
diff --git a/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc b/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
index 8104cbe80514b1..2956043e9bd18c 100644
--- a/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
+++ b/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
@@ -71,7 +71,12 @@ XpuEventResourcePool::XpuEventResourcePool() {

     auto deleter = [dev_idx](xpuEventHandle event) {
       phi::backends::xpu::XPUDeviceGuard guard(dev_idx);
-      xpu_event_destroy(event);
+      if (xpu_event_query(event) == XPU_SUCCESS) {
+        xpu_event_destroy(event);
+      } else {
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "The XPU event has not finished and cannot be destroyed."));
+      }
     };

     pool_.emplace_back(ResourcePool::Create(creator, deleter));
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 98b432e39975a3..50c2d0801f0852 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -1911,7 +1911,7 @@ void FusedMatmulInferMeta(const MetaTensor& x,
                           const std::vector<int>& fused_transpose_Y,
                           const std::vector<int>& fused_reshape_Out,
                           const std::vector<int>& fused_transpose_Out,
-                          const std::string& mkldnn_data_type,
+                          const std::string& onednn_data_type,
                           const float scale_x,
                           const float scale_y,
                           const float scale_scale_in_eltwise,
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index 419559f2be0c6d..b8cd51a2d7d052 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -384,7 +384,7 @@ void FusedMatmulInferMeta(const MetaTensor& x,
                           const std::vector<int>& fused_transpose_Y,
                           const std::vector<int>& fused_reshape_Out,
                           const std::vector<int>& fused_transpose_Out,
-                          const std::string& mkldnn_data_type,
+                          const std::string& onednn_data_type,
                           const float scale_x,
                           const float scale_y,
                           const float scale_scale_in_eltwise,
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index 0152ddcebc90bf..e7b6980f3b70bf 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -4800,7 +4800,7 @@ void MultiGruInferMeta(
     const std::string& gate_activation,
     int layers,
     bool origin_mode,
-    const std::string& mkldnn_data_type,
+    const std::string& onednn_data_type,
     float scale_data,
     float shift_data,
     bool force_fp32_output,
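The malloc.h declaration above makes RecordStream part of the public memory API for XPU builds. A hedged usage sketch (illustrative names; assumes the Paddle XPU build, and the phi::Allocation spelling assumes the alias exposed by malloc.h):

    #include "paddle/phi/core/memory/malloc.h"

    // Sketch: read `alloc` on a stream other than the one it was allocated
    // on, then record that stream so the stream-safe allocator defers the
    // free. LaunchReader is a hypothetical kernel launcher, not a Paddle API.
    void ReadOnSecondStream(std::shared_ptr<phi::Allocation> alloc,
                            XPUStream second_stream) {
      LaunchReader(alloc->ptr(), second_stream);           // hypothetical
      paddle::memory::RecordStream(alloc, second_stream);  // from this diff
    }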
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 6e840d67ead536..a3e6342b09f0a1 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -1080,7 +1080,7 @@ void MultiGruInferMeta(
     const std::string& gate_activation,
     int layers,
     bool origin_mode,
-    const std::string& mkldnn_data_type,
+    const std::string& onednn_data_type,
     float scale_data,
     float shift_data,
     bool force_fp32_output,
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 3e1edcddd48d92..bb10157cfc69da 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -6098,7 +6098,7 @@ void FusedConvInferMeta(const MetaTensor& input,
                         const std::vector<int>& dilations,
                         int groups,
                         const std::string& data_format,
-                        const std::string& mkldnn_data_type,
+                        const std::string& onednn_data_type,
                         const std::string& fuse_activation,
                         bool fuse_residual_conn,
                         bool force_fp32_output,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 4060963ca9e0e9..67027f75097f7e 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -1231,7 +1231,7 @@ void FusedConvInferMeta(const MetaTensor& input,
                         const std::vector<int>& dilations,
                         int groups,
                         const std::string& data_format,
-                        const std::string& mkldnn_data_type,
+                        const std::string& onednn_data_type,
                         const std::string& fuse_activation,
                         bool fuse_residual_conn,
                         bool force_fp32_output,
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 96e93c7a97ff12..167be9f2e0d74e 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -584,7 +584,7 @@ static void SliceTensor(DenseTensor *x,
     DenseTensorMeta meta(share->dtype(),
                          new_dim,
                          share->layout(),
-                         offset * SizeOf(share->dtype()));
+                         offset * SizeOf(share->dtype()) + share->offset());
     x->set_meta(meta);
     x->ShareBufferWith(*(share), true);
     x->Resize(new_dim);
diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h
index 27de6176657e7f..e1de295be330b5 100644
--- a/paddle/phi/kernels/funcs/reduce_grad_functions.h
+++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h
@@ -38,10 +38,10 @@ void ReduceGradFunctor(const Context& dev_ctx,
   auto x_dims = input0.dims();
   auto reduced_dims_v = common::vectorize(x_dims);
   std::vector<int> dims_ref = dims;
-  Eigen::array<int, D> broadcast_dim;
+  Eigen::array<int64_t, D> broadcast_dim;
   for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;

-  int broad_cast_times = 1;
+  int64_t broad_cast_times = 1;
   for (size_t i = 0; i < dims_ref.size(); ++i) {
     if (dims_ref[i] < 0) {
       dims_ref[i] = x_rank + dims_ref[i];
@@ -142,7 +142,7 @@ void LaunchReduceGradKernel(const Context& dev_ctx,
   auto& place = *dev_ctx.eigen_device();
   auto broadcast_dim =
-      Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Eigen::array<int64_t, 1>({{static_cast<int64_t>(input0->numel())}});
   functor(place,
           &x,
           &x_reduce,
diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
index 3de66d3d944ba0..fecd6bb71d3a54 100644
--- a/paddle/phi/kernels/gpu/argsort_kernel.cu
+++ b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -198,6 +198,13 @@ void ArgFullSort(const phi::GPUContext& dev_ctx,
                  const int64_t num_rows,
                  const int64_t num_cols,
                  const bool descending) {
+  PADDLE_ENFORCE_LE(num_cols,
+                    std::numeric_limits<int32_t>::max(),
+                    ::common::errors::PreconditionNotMet(
+                        "The dimension being sorted should be less than "
+                        "2^31, but got %lld. Please check the input tensor.",
+                        num_cols));
+
   auto cu_stream = dev_ctx.stream();
   auto ComputeBlockSize = [](IndType col) {
     if (col > 512)
@@ -228,8 +235,14 @@ void ArgFullSort(const phi::GPUContext& dev_ctx,
   const int64_t total_elements = num_cols * num_rows;
   const int64_t segment_size = num_cols;
   const int64_t element_per_call = std::min(max_elements, total_elements);
+
+  // Ensure that each call covers at least one full segment.
+  const int64_t adjusted_elements_per_call =
+      std::max(max_elements, segment_size);
+
   // make sure batch size is the multiple of segment_size
-  const int64_t batch_size = (element_per_call / segment_size) * segment_size;
+  const int64_t batch_size =
+      (adjusted_elements_per_call / segment_size) * segment_size;

   int64_t offset = 0;
   DenseTensor input_indices;
diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
index a55458d59a2d57..1993caec70adb3 100644
--- a/paddle/phi/kernels/gpu/dist_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -63,6 +63,18 @@ struct PowFunctor {
   Ty p_order_;
 };

+template  // Tx is high precision, Tout is low/out precision
+struct PowFunctorHighPrecision {
+  HOSTDEVICE explicit inline PowFunctorHighPrecision(const Ty& p_order)
+      : p_order_(p_order) {}
+  HOSTDEVICE inline Tx operator()(const Tx x) const {
+    return static_cast(pow(static_cast(x), p_order_));
+  }
+  Ty p_order_;
+};
+
 template <typename T, typename Functor>
 __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
@@ -126,16 +138,17 @@ void DistKernel(const Context& dev_ctx,
   DenseTensor intermediate;
   const T* x_ptr = x.data<T>();
   const T* y_ptr = y.data<T>();
+  T* o_ptr = dev_ctx.template Alloc<T>(out);
   auto stream = dev_ctx.stream();

   auto xdim = x.dims();
   if (xdim == y.dims()) {  // same shape
-    auto n = x.numel();
+    int64_t n = x.numel();
+
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
     intermediate.Resize(common::make_ddim({config.block_per_grid.x}));
     T* i_ptr = dev_ctx.template Alloc<T>(&intermediate);
-
     std::vector axis_dims = {static_cast(-1)};
     std::vector reduce_axis =
         funcs::details::GetReduceDim(axis_dims, xdim.size(), true);
@@ -166,15 +179,23 @@ void DistKernel(const Context& dev_ctx,
     ReduceSumWithSubtract<T>
         <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
            x_ptr, y_ptr, i_ptr, n, OtherOrderFunctor<T>(p_order));
-    phi::funcs::ReduceKernel>(
-        dev_ctx, intermediate, out, kps::IdentityFunctor(), reduce_axis);
-
-    const DenseTensor* tmp_norm = out;
-    std::vector<const DenseTensor*> ins = {tmp_norm};
+    DenseTensor out_other;
+    out_other.Resize(out->dims());
+    dev_ctx.template Alloc<MT>(&out_other);
+
+    phi::funcs::
+        ReduceKernel>(
+            dev_ctx,
+            intermediate,
+            &out_other,
+            kps::IdentityFunctor(),
+            reduce_axis);
+    std::vector<const DenseTensor*> ins = {&out_other};
     std::vector<DenseTensor*> outs = {out};
-    MT p_order_ = static_cast<MT>(static_cast<MT>(1.)
/ p_order); + + MT p_order_ = static_cast(1.f / p_order); phi::funcs::ElementwiseKernel( - dev_ctx, ins, &outs, PowFunctor(p_order_)); + dev_ctx, ins, &outs, PowFunctorHighPrecision(p_order_)); } } else { diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu index fdfed25b3dda8f..5efd6a36a5399f 100644 --- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu @@ -42,10 +42,12 @@ struct AbsMaxAndMinGradFunctor { template struct PNormGradFunctor { + using MT = typename phi::dtype::MPTypeTrait::Type; HOSTDEVICE explicit inline PNormGradFunctor(float porder, float eps) { - this->porder = static_cast(porder - 1.); - this->eps = static_cast(eps); + this->porder = static_cast(porder - 1.); + this->eps = static_cast(eps); } + template template cast(); + auto y_mt = y->template cast(); + auto dy_mt = dy->template cast(); + + auto norm_pow = y_mt.pow(-this->porder); + auto mask_norm_nonzero = (y_mt != static_cast(0)).template cast(); + + // Set to 0 where porder < 0 and x == 0 + MT zero = static_cast(0); + auto mask_x_zero = (x_mt == zero).template cast(); + + MT is_porder_negative = + this->porder < zero ? static_cast(1) : static_cast(0); + auto invalid_mask = (mask_x_zero * is_porder_negative); + auto safe_pow = + x_mt.abs().pow(this->porder) * (static_cast(1) - invalid_mask); + dx->device(place) = - (*x).abs().pow(this->porder) * (*x).sign() * dy->broadcast(dim) * - (*y + y->constant(eps)).pow(-this->porder).broadcast(dim); + (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) * + norm_pow.broadcast(dim) * + mask_norm_nonzero.broadcast(dim) // Mask out positions where norm == 0 + ) + .template cast(); } - T porder; - T eps; + + MT porder; + MT eps; }; template diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index 9b0515feb33544..8809b082b7a826 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -124,31 +124,38 @@ void PNormKernel(const Context& dev_ctx, phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, UnsignedPowFunctor(1. / porder)); #else + DenseTensor out_temp; + out_temp.Resize(out_norm->dims()); + dev_ctx.template Alloc(&out_temp); + if (porder == 1.0) { // fast 1-norm phi::funcs::ReduceKernel>( dev_ctx, *in_x, out_norm, FabsFunctor(), reduce_axis); } else if (porder == 2.0) { // fast 2-norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, SquareFunctor(), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, *in_x, &out_temp, SquareFunctor(), reduce_axis); } else if (porder == 3.0) { // fast 3-norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, FabsCubicFunctor(), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, *in_x, &out_temp, FabsCubicFunctor(), reduce_axis); } else { // vanilla norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, + *in_x, + &out_temp, + UnsignedPowFunctor(porder), + reduce_axis); } if (porder != 1.0) { - // save computation when porder is 1.0 - const DenseTensor* tmp_norm = out_norm; - std::vector ins = {tmp_norm}; + std::vector ins = {&out_temp}; std::vector outs = {out_norm}; + MT p_order_ = static_cast(1.f / porder); phi::funcs::ElementwiseKernel( - dev_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); + dev_ctx, ins, &outs, UnsignedPowFunctor(p_order_)); } #endif } diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 91141b09aae8ce..3cd9d0f0aaeb47 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -37,10 +37,207 @@ limitations under the License. */ #endif #include "paddle/phi/kernels/full_kernel.h" +#ifdef PADDLE_WITH_CUDNN_FRONTEND +// clang-format off +#include "paddle/phi/backends/dynload/cudnn_frontend.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h" +// clang-format on +#endif + namespace phi { using GPUDNNDataLayout = phi::backends::gpu::DataLayout; +template +void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, + const DenseTensor* filter, + const Context& dev_ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations_, + GPUDNNDataLayout data_layout, + GPUDNNDataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_out) { + int iwo_groups = 1; + int c_groups = groups; + groups = 1; + size_t workspace_size = 0; + + const T* x_data = transformed_x->data(); + const T* filter_data = filter->data(); + T* transformed_out_data = transformed_out->data(); +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t algo{}; +#else + cudnnConvolutionBwdDataAlgo_t algo{}; +#endif + // ------------------- cudnn conv algorithm --------------------- + auto handle = dev_ctx.cudnn_handle(); + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + auto dtype = phi::backends::gpu::CudnnDataType::type; + // ------------------- cudnn descriptors --------------------- + ConvArgs args{handle, + transformed_out, + filter, + transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + data_layout}; + args.idesc.set(*transformed_out, iwo_groups); + args.wdesc.set(*filter, layout_tensor, iwo_groups); + args.odesc.set(*transformed_x, iwo_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + SearchResult bwd_result; + using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + bwd_result.algo = search::Find( + args, exhaustive_search, deterministic, workspace_size, dev_ctx); +#else + SearchResult bwd_result; + using search = SearchAlgorithm; + bwd_result = + search::Find(dev_ctx, args, exhaustive_search, deterministic, false); + workspace_size = + std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); +#endif + + // ------------------- cudnn conv transpose forward --------------------- + int x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; + int out_offset = + transformed_out->numel() / transformed_out->dims()[0] / groups; + int filter_offset = filter->numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args.odesc.desc(), + x_data + x_offset * g, + args.wdesc.desc(), + filter_data + filter_offset * g, + args.cdesc.desc(), + bwd_result.algo, + &beta, + args.idesc.desc(), + transformed_out_data 
+ out_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else + ConvRunner::Apply(dev_ctx, + args, + bwd_result, + x_data, + filter_data, + transformed_out_data, + groups, + out_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif +} +#ifdef PADDLE_WITH_CUDNN_FRONTEND +template +void ConvTransposeCudnnKernelImplV8(const DenseTensor* transformed_x, + const DenseTensor* filter, + const Context& dev_ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations_, + GPUDNNDataLayout data_layout, + GPUDNNDataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_out) { + auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kConvBackwardDataV8); + + T* input_data = const_cast(transformed_x->data()); + T* filter_data = const_cast(filter->data()); + T* output_data = transformed_out->data(); + cudnnHandle_t handle = const_cast(dev_ctx.cudnn_handle()); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout); + auto dtype = phi::backends::gpu::CudnnDataType::type; + + float alpha = 1.0f; + float beta = 0.0f; + + using helper = CudnnFrontendConvHelper; + auto op_graph = helper::BuildConvOperationGraph< + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR>( + transformed_out, + transformed_x, + filter, + layout_format, + strides, + padding_common, + dilations_, + dtype, + handle, + alpha, + beta); + if (plan_cache.FindPlan(op_graph, handle)) { + const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; + int64_t workspace_size = 0; + plan_cache.GetPlanAndWorkspaceSize( + op_graph, &cached_plan, &workspace_size, handle); + helper::ExecutePlan(handle, + &workspace_handle, + output_data, + input_data, + filter_data, + cached_plan->get_raw_desc(), + workspace_size); + return; + } + + auto plans = helper::FindExecutionPlans(&op_graph, + exhaustive_search, + deterministic, + output_data, + input_data, + filter_data, + handle, + &workspace_handle); + + helper::ExecutePlansAndCache(handle, + &workspace_handle, + output_data, + input_data, + filter_data, + &plans, + exhaustive_search, + op_graph, + &plan_cache); +} +#endif + template void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, const DenseTensor& x, @@ -57,15 +254,28 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); return; } + + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; + bool deterministic = FLAGS_cudnn_deterministic; + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, + false, + common::errors::InvalidArgument( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + std::vector paddings_ = paddings; - std::vector dilations_ = - dilations; // cudnn v5 does not support dilations - const T* filter_data = filter.data(); + std::vector dilations_ = dilations; const GPUDNNDataLayout data_layout = (data_format != "NHWC" ? 
GPUDNNDataLayout::kNCHW : GPUDNNDataLayout::kNHWC); - std::vector x_vec = common::vectorize(x.dims()); - std::vector out_vec = common::vectorize(out->dims()); + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(out->dims()); // if channel_last, transpose to channel_first DenseTensor x_transpose; if (data_layout == GPUDNNDataLayout::kNHWC) { @@ -106,7 +316,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, std::vector padding_common(data_dim, 0); if (!is_sys_pad) { std::vector padding_diff(data_dim); - std::vector new_x_shape_vec(data_dim + 2); + std::vector new_x_shape_vec(data_dim + 2); new_x_shape_vec[0] = x_dims[0]; new_x_shape_vec[1] = x_dims[1]; @@ -158,10 +368,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, axes[i] = i + 2; } - const T* x_data = transformed_x.data(); - x_vec = common::vectorize(transformed_x.dims()); + x_vec = common::vectorize(transformed_x.dims()); - std::vector transformed_out_vec = out_vec; + std::vector transformed_out_vec = out_vec; for (size_t i = 0; i < data_dim; ++i) { transformed_out_vec[i + 2] = out_vec[i + 2] + (x_pad[2 * i + 4] + x_pad[2 * i + 5]) * strides[i] - @@ -177,119 +386,55 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, transformed_out.ShareDataWith(*out); transformed_out.Resize(common::make_ddim(transformed_out_vec)); } - T* transformed_out_data = transformed_out.data(); - -#ifndef PADDLE_WITH_HIP - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_x); - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_out); -#endif GPUDNNDataLayout layout; - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - if (strides.size() == 2U) { layout = GPUDNNDataLayout::kNCHW; } else { layout = GPUDNNDataLayout::kNCDHW; } - size_t workspace_size = 0; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t algo{}; +#ifdef PADDLE_WITH_CUDNN_FRONTEND + if (dynload::IsCudnnFrontendEnabled()) + ConvTransposeCudnnKernelImplV8(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); + else + ConvTransposeCudnnKernelImplV7(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); #else - cudnnConvolutionBwdDataAlgo_t algo{}; + ConvTransposeCudnnKernelImplV7(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); #endif - // ------------------- cudnn conv algorithm --------------------- - auto handle = dev_ctx.cudnn_handle(); - auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); - bool deterministic = FLAGS_cudnn_deterministic; - - auto dtype = phi::backends::gpu::CudnnDataType::type; - // ------------------- cudnn descriptors --------------------- - ConvArgs args{handle, - &transformed_out, - &filter, - &transformed_x, - strides, - padding_common, - dilations_, - dtype, - groups, - data_layout}; - args.idesc.set(transformed_out, iwo_groups); - args.wdesc.set(filter, layout_tensor, iwo_groups); - args.odesc.set(transformed_x, iwo_groups); - args.cdesc.set(dtype, - padding_common, - strides, - dilations_, - phi::AllowTF32Cudnn(), - c_groups); - -#ifdef PADDLE_WITH_HIP - 
SearchResult bwd_result; - using search = SearchAlgorithm; - workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - bwd_result.algo = - search::Find(args, false, deterministic, workspace_size, dev_ctx); -#else - SearchResult bwd_result; - using search = SearchAlgorithm; - bwd_result = search::Find(dev_ctx, args, false, deterministic, false); - workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); -#endif - - // ------------------- cudnn conv transpose forward --------------------- - int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups; - int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups; - int filter_offset = filter.numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); -#ifdef PADDLE_WITH_HIP - for (int g = 0; g < groups; g++) { - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args.odesc.desc(), - x_data + x_offset * g, - args.wdesc.desc(), - filter_data + filter_offset * g, - args.cdesc.desc(), - bwd_result.algo, - &beta, - args.idesc.desc(), - transformed_out_data + out_offset * g, - cudnn_workspace, - workspace_size)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size); - } -#else // PADDLE_WITH_HIP - ConvRunner::Apply(dev_ctx, - args, - bwd_result, - x_data, - filter_data, - transformed_out_data, - groups, - out_offset, - filter_offset, - x_offset, - workspace_size, - &workspace_handle, - false); -#endif // PADDLE_WITH_HIP if (!is_sys_pad && strides.size() == 2U) { funcs::Slice( diff --git a/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h b/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h index d6b9e1976a1270..4d78b934ab17b1 100644 --- a/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h +++ b/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h @@ -108,7 +108,7 @@ struct AccuracyCheckFunctor> { for (int i = 0; i < num; i++) { out_data[i] = true; } - bool val; + bool val = false; int res_index = -1; for (int i = 0; i < num; i++) { const phi::dtype::complex a = in_a[i], b = in_b[i]; diff --git a/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h b/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h index 0354e28761ab9a..8aa0c09d28ff05 100644 --- a/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h @@ -44,7 +44,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, std::vector resize_dims; std::vector recover_shape; std::vector t_shape = common::vectorize(t.dims()); - for (int i = 0; i < op_label.size(); i++) { + for (size_t i = 0; i < op_label.size(); i++) { int c = op_label[i]; if (label2type[c] == LabelType::Reduction) { repeat_times.push_back(label2shape[c]); @@ -64,7 +64,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, "shape size: `%d`, but got label nums: `%d`", t_shape.size(), op_label.size())); - for (int i = 0; i < op_label.size(); i++) { + for (size_t i = 0; i < op_label.size(); i++) { int c = op_label[i]; if (label2type[c] == LabelType::Contraction && t_shape[i] != label2shape[c]) { diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 652cb7b078fa50..1bde7d4f727ce3 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -21,7 +21,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn 
= false, bool use_onednn = false] - op : accuracy inputs : @@ -42,7 +42,7 @@ out : Out backward : acosh_grad extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : adadelta_ (adadelta) inputs : @@ -104,7 +104,7 @@ attrs : {scale_x : Scale_x, scale_y : Scale_y, scale_out : Scale_out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -114,7 +114,7 @@ outputs: {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : add_position_encoding backward: add_position_encoding_grad @@ -132,7 +132,7 @@ attrs : {alpha : Alpha, beta : Beta} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : affine_channel backward: affine_channel_grad @@ -163,7 +163,7 @@ out : Out manual_signature : [all] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : allclose inputs : @@ -187,7 +187,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : amax_grad : GetReduceGradExpectedKernelType manual_signature : [amax] @@ -201,7 +201,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : amin_grad : GetReduceGradExpectedKernelType manual_signature : [amin] @@ -219,7 +219,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : any (reduce_any) inputs : @@ -229,7 +229,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : any : GetReduceOpUseInputPlaceExpectedKernelType manual_signature : [any] @@ -310,7 +310,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : assert inputs : @@ -357,7 +357,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : attention_lstm backward: attention_lstm_grad @@ -381,7 +381,7 @@ attrs : {alpha : Alpha, beta : Beta} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : barrier inputs : @@ -414,7 +414,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool fuse_with_relu = false] - op : bce_loss backward : bce_loss_grad @@ -444,7 +444,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : bilinear (bilinear_tensor_product) backward: bilinear_grad (bilinear_tensor_product_grad) @@ -462,7 +462,7 @@ attrs: data_format: data_layout extra : - 
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : bincount inputs : @@ -564,7 +564,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : ceil backward : ceil_grad @@ -573,7 +573,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : celu backward : celu_grad, celu_double_grad(celu_grad_grad) @@ -622,7 +622,7 @@ data_type : float tensor_name : Max extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : clip_by_norm inputs : @@ -667,7 +667,7 @@ tensor_name : AxisTensor drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] get_expected_kernel_type : concat : GetConcatExpectedKernelType @@ -689,7 +689,7 @@ outputs : out : Output extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_addto = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, bool use_addto = false, bool force_fp32_output = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false, str mkldnn_data_type = "float32"] get_expected_kernel_type : @@ -707,7 +707,7 @@ support_tensor : true extra : inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] @@ -722,7 +722,7 @@ data_type : int support_tensor : true extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = true, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = true, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f] @@ -733,7 +733,7 @@ outputs : out : Output extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] @@ -747,7 +747,7 @@ outputs : out : Output extra : - attrs : [bool use_cudnn = true, bool use_mkldnn = false, int 
workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] + attrs : [bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] - op : correlation backward : correlation_grad @@ -756,7 +756,7 @@ outputs : out : Output extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : cos backward : cos_grad, cos_double_grad, cos_triple_grad @@ -765,7 +765,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : cosh backward : cosh_grad @@ -774,7 +774,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : crop (crop_tensor) backward : crop_grad (crop_tensor_grad) @@ -837,7 +837,7 @@ - op : data_norm backward : data_norm_grad extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : decode_jpeg inputs : @@ -861,7 +861,7 @@ attrs : {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, @@ -882,7 +882,7 @@ support_tensor : true extra : inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] @@ -979,7 +979,7 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : dot @@ -1069,7 +1069,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [elementwise_pow] @@ -1081,7 +1081,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : embedding (lookup_table_v2) backward : embedding_grad (lookup_table_v2_grad) @@ -1137,7 +1137,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : expand 
(expand_v2) backward : expand_grad (expand_v2_grad), expand_double_grad(expand_v2_double_grad) @@ -1153,7 +1153,7 @@ tensor_name : Shape tensors_name : expand_shapes_tensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [expand, expand_grad] - op : expand_as (expand_as_v2) @@ -1170,7 +1170,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : exponential_ (exponential) backward : exponential__grad (exponential_grad) @@ -1280,7 +1280,7 @@ attrs : {scale_in : Scale_in, scale_out : Scale_out, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false, str fuse_activation = "" , float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, 'int[] fused_reshape2_shape = {}'] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false, str fuse_activation = "" , float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, 'int[] fused_reshape2_shape = {}'] - op : feed outputs: {out: Out} @@ -1357,7 +1357,7 @@ {start_axis : start_axis, stop_axis : stop_axis} extra : outputs : [xshape] - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [flatten, flatten_grad] - op : flip @@ -1373,7 +1373,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : floor_divide (elementwise_floordiv) inputs : @@ -1381,7 +1381,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [floor_divide] @@ -1393,7 +1393,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [fmax] @@ -1405,7 +1405,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [fmin] @@ -1439,13 +1439,13 @@ frobenius_norm : GetReduceExpectedKernelType frobenius_norm_grad : GetReduceGradExpectedKernelType extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : full (fill_constant) outputs : out : Out extra : - attrs : [bool use_mkldnn = 
false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : full_like (fill_any_like) inputs : @@ -1565,7 +1565,7 @@ {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fused_conv2d_add_act inputs : @@ -1577,7 +1577,7 @@ output : Output outputs : Outputs extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, @@ -1594,7 +1594,7 @@ {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fused_elementwise_add inputs : @@ -1741,7 +1741,7 @@ attrs : {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] - op : fusion_lstm inputs : @@ -1765,7 +1765,7 @@ attrs : {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = true, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fusion_repeated_fc_relu inputs : @@ -1834,7 +1834,7 @@ tensor_name : ShapeTensor tensors_name : ShapeTensorList extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] manual_signature : [gaussian] - op : gelu @@ -1844,7 +1844,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : generate_proposals(generate_proposals_v2) inputs : @@ -1873,7 +1873,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : 
graph_khop_sampler @@ -1948,7 +1948,7 @@ out : Out backward : hardswish_grad (hard_swish_grad) extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] manual_signature : [hardswish] - op : hardtanh (brelu) @@ -1973,7 +1973,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -2137,7 +2137,7 @@ out : Out backward : l1_norm_grad extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : label_smooth inputs : @@ -2162,7 +2162,7 @@ mean : Mean variance : Variance extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] get_expected_kernel_type : layer_norm : GetLayerNormExpectedKernelType @@ -2175,7 +2175,7 @@ attrs: negative_slope : alpha extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_bilinear_interp (bilinear_interp) backward : legacy_bilinear_interp_grad (bilinear_interp_grad) @@ -2186,7 +2186,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_expand (expand) backward : legacy_expand_grad (expand_grad) @@ -2202,7 +2202,7 @@ tensor_name : ExpandTimes tensors_name : expand_times_tensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [legacy_expand, legacy_expand_grad] - op : legacy_generate_proposals(generate_proposals) @@ -2236,7 +2236,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_reshape (reshape) backward : legacy_reshape_grad (reshape_grad) @@ -2251,7 +2251,7 @@ tensor_name : Shape tensors_name : ShapeTensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] - op : lerp backward : lerp_grad @@ -2287,7 +2287,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : linspace inputs : @@ -2309,7 +2309,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log10 backward : log10_grad @@ -2318,7 +2318,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log1p backward : log1p_grad @@ -2327,7 +2327,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log2 backward : log2_grad @@ -2336,7 +2336,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn 
= false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log_loss backward : log_loss_grad @@ -2352,7 +2352,7 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : logcumsumexp backward : logcumsumexp_grad @@ -2398,7 +2398,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : logsumexp backward : logsumexp_grad @@ -2424,7 +2424,7 @@ outputs : {out : Out, mid_out : MidOut} extra : - attrs : [bool use_mkldnn = false, bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool is_test = false] - op : lstsq inputs : @@ -2473,7 +2473,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool force_fp32_output = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool force_fp32_output = false] complex_promote : [X, Y] - op : matmul_with_flatten (mul) @@ -2483,7 +2483,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', + attrs : [bool use_mkldnn = false, bool use_onednn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', float scale_out = 1.0f, bool force_fp32_output = false] - op : matrix_nms @@ -2516,7 +2516,7 @@ outputs: out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2549,7 +2549,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [maximum] @@ -2569,7 +2569,7 @@ attrs : {axis : dim, keepdim : keep_dim} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2637,7 +2637,7 @@ attrs: { axis : dim, keepdim : keep_dim} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2654,7 +2654,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [minimum] @@ -2666,7 +2666,7 @@ outputs: out: Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : mode backward : mode_grad @@ -2751,7 +2751,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : mv @@ -2781,7 +2781,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] 
- op : nll_loss backward : nll_loss_grad @@ -2863,7 +2863,7 @@ - op : pad2d backward : pad2d_grad extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pad3d backward : pad3d_grad, pad3d_double_grad @@ -2878,7 +2878,7 @@ attrs : pad_value : value extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : partial_allgather inputs : @@ -2894,7 +2894,7 @@ out : Out drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : partial_recv outputs : @@ -2908,7 +2908,7 @@ out : Out drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pixel_shuffle backward : pixel_shuffle_grad @@ -2947,7 +2947,7 @@ pool2d_grad : GetPoolExpectedKernelType pool2d_double_grad : GetPoolDoubleGradExpectedKernelType extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool is_test = false] - op : pool3d @@ -2962,7 +2962,7 @@ pool3d : GetPoolExpectedKernelType pool3d_grad : GetPoolExpectedKernelType extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pow backward : pow_grad, pow_double_grad, pow_triple_grad @@ -2984,7 +2984,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] - op : print inputs : @@ -2998,7 +2998,7 @@ outputs : {out: Boxes, var: Variances} extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] - op : prod (reduce_prod) backward : prod_grad (reduce_prod_grad) @@ -3013,7 +3013,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : prod : GetReduceExpectedKernelType prod_grad : GetReduceGradExpectedKernelType @@ -3093,7 +3093,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : relu backward : relu_grad, relu_double_grad (relu_grad_grad) @@ -3102,7 +3102,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] - op : relu6 backward : relu6_grad @@ -3111,7 +3111,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float threshold = 6.0] + attrs : [bool use_mkldnn = false, bool use_onednn = false, float threshold = 6.0] - op : remainder (elementwise_mod) inputs : @@ -3119,7 +3119,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [remainder] @@ 
-3131,7 +3131,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : repeat_interleave inputs : @@ -3180,7 +3180,7 @@ tensor_name : Shape tensors_name : ShapeTensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] - op : resnet_basic_block backward: resnet_basic_block_grad @@ -3253,7 +3253,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : row_conv backward : row_conv_grad @@ -3269,7 +3269,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : save_combine inputs : @@ -3289,7 +3289,7 @@ data_type : float support_tensor : false extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : scatter backward : scatter_grad @@ -3398,7 +3398,7 @@ get_expected_kernel_type : sgd_ : GetSgdExpectedKernelType extra : - attrs : [bool use_mkldnn=false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : shape inputs : @@ -3408,7 +3408,7 @@ - op : shape extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : shard_index inputs : @@ -3443,7 +3443,7 @@ outputs: {out : Out} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : sigmoid backward : sigmoid_grad, sigmoid_double_grad (sigmoid_grad_grad), sigmoid_triple_grad @@ -3452,7 +3452,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] - op : sign backward : sign_grad @@ -3468,7 +3468,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : sin backward : sin_grad, sin_double_grad, sin_triple_grad @@ -3477,7 +3477,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : sinh backward : sinh_grad @@ -3486,7 +3486,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : slice backward : slice_grad @@ -3495,7 +3495,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] int_array : starts : data_type : int @@ -3530,7 +3530,7 @@ softmax : GetSoftmaxExpectedKernelType softmax_grad : GetSoftmaxGradExpectedKernelType extra : - attrs : [str data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [str 
data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] - op : softplus backward : softplus_grad, softplus_double_grad @@ -3539,7 +3539,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : softshrink backward : softshrink_grad @@ -3557,7 +3557,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : solve inputs : @@ -3619,7 +3619,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : split_with_num scalar : @@ -3635,7 +3635,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : square backward : square_grad, square_double_grad (square_grad_grad) @@ -3644,7 +3644,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : squeeze (squeeze2) backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) @@ -3659,7 +3659,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] outputs : [xshape] - op : stack @@ -3669,7 +3669,7 @@ outputs : out : Y extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] drop_empty_grad : [x_grad] - op : stanh @@ -3716,7 +3716,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -3729,7 +3729,7 @@ attrs: { axis : dim, keepdim : keep_dim, dtype : out_dtype} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] int_array: axis : data_type : int @@ -3753,7 +3753,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float beta = 1.0] + attrs : [bool use_mkldnn = false, bool use_onednn = false, float beta = 1.0] - op : sync_batch_norm inputs : @@ -3764,7 +3764,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool fuse_with_relu = false] - op : take_along_axis backward : take_along_axis_grad @@ -3782,7 +3782,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : tanh backward : tanh_grad, tanh_double_grad (tanh_grad_grad), tanh_triple_grad @@ -3791,7 +3791,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - 
op : tanh_shrink backward : tanh_shrink_grad @@ -3800,7 +3800,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : tdm_child inputs : @@ -3808,7 +3808,7 @@ outputs : {child : Child, leaf_mask : LeafMask} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : tdm_sampler inputs: @@ -3872,7 +3872,7 @@ perm : axis extra : outputs : [XShape] - attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] - op : triangular_solve backward : triangular_solve_grad @@ -3897,7 +3897,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : trunc inputs : diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 9111fe8eda5af1..53680e172adcd6 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -122,6 +122,7 @@ _pir_ops as _pir_ops, _typing as _typing, callbacks as callbacks, + compat as compat, fft as fft, hub as hub, linalg as linalg, @@ -328,6 +329,7 @@ masked_scatter_, moveaxis, put_along_axis, + ravel, repeat_interleave, reshape, reshape_, @@ -579,6 +581,7 @@ kthvalue, masked_select, mode, + msort, nonzero, searchsorted, sort, @@ -879,6 +882,7 @@ 'summary', 'flops', 'sort', + 'msort', 'searchsorted', 'bucketize', 'split', @@ -1092,6 +1096,7 @@ 'std', 'flatten', 'flatten_', + 'ravel', 'asin', 'multiply', 'multiply_', diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index b6ab11fad00a8c..dc434c2337f96b 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -325,6 +325,8 @@ def to_list(s): _switch_tracer, _test_enforce_gpu_success, _xpu_device_synchronize, + _xpu_get_current_stream, + _xpu_set_current_stream, ) # isort: off diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index bea78c7a528d9b..15270ea89e19b6 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -65,6 +65,36 @@ _already_patch_eager_tensor = False +_supported_dtype_conversions = { + # float + 'float16': 'float16', + 'half': 'float16', + 'bfloat16': 'bfloat16', + 'float32': 'float32', + 'float': 'float32', + 'float64': 'float64', + 'double': 'float64', + # int + 'int8': 'int8', + 'char': 'int8', + # We handle uint8 conversion separately + # 'uint8': 'uint8', + # 'byte': 'uint8', + 'int16': 'int16', + 'short': 'int16', + 'int32': 'int32', + 'int': 'int32', + 'int64': 'int64', + 'long': 'int64', + # other + 'bool': 'bool', + 'complex64': 'complex64', + 'complex128': 'complex128', + 'cfloat': 'complex64', + 'cdouble': 'complex128', +} + + def monkey_patch_math_tensor(): """ Similar to monkey_patch_variable. 
@@ -104,6 +134,44 @@ def astype(self: Tensor, dtype: DTypeLike) -> Tensor: return _C_ops.cast(self, dtype) + def byte(self: Tensor) -> Tensor: + # Paddle does not support casting float to uint8 directly, so convert to int8 first + if self.is_floating_point(): + tensor = astype(self, 'int8') + return astype(tensor, 'uint8') + elif self.is_complex(): + real = astype(self.real(), 'int8') + return astype(real, 'uint8') + else: + return astype(self, 'uint8') + + def _create_dtype_conversion_methods(): + """ + Create all data type conversion methods in one batch + """ + methods = [] + + for method_name, target_dtype in _supported_dtype_conversions.items(): + + def make_conversion_method(dtype): + def conversion_method(self: Tensor) -> Tensor: + return astype(self, dtype) + + return conversion_method + + method_impl = make_conversion_method(target_dtype) + method_impl.__name__ = method_name + method_impl.__doc__ = f""" + Cast a Tensor to {target_dtype} data type if it differs from the current dtype; + otherwise, return the original Tensor. + Returns: + Tensor: a new Tensor with {target_dtype} dtype + """ + + methods.append((method_name, method_impl)) + + return methods + def _scalar_elementwise_op_( var: Tensor, scale: float, bias: float ) -> Tensor: @@ -225,6 +293,8 @@ def _mT_(var: Tensor) -> Tensor: ('__len__', _len_), ('__index__', _index_), ('astype', astype), + ('byte', byte), + ('uint8', byte), ('dim', dim), ('ndimension', ndimension), ('ndim', _ndim), @@ -235,6 +305,9 @@ def _mT_(var: Tensor) -> Tensor: ('__array_ufunc__', None), ] + dtype_conversion_methods = _create_dtype_conversion_methods() + eager_methods.extend(dtype_conversion_methods) + eager_cpp_level_patch = [ "__add__", "__radd__", diff --git a/python/paddle/compat.py b/python/paddle/compat.py new file mode 100644 index 00000000000000..d42b733edccc80 --- /dev/null +++ b/python/paddle/compat.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
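The eager-mode monkey patch above builds every dtype-conversion alias from one closure factory and routes `byte()`/`uint8()` through the special-cased `byte` helper. A minimal sketch of the resulting Tensor-level behavior, assuming the methods are registered exactly as listed in `eager_methods`:

```python
import paddle

x = paddle.to_tensor([1.25, -2.5])  # float32 by default

# Generated aliases are thin wrappers around astype():
print(x.half().dtype)    # paddle.float16
print(x.double().dtype)  # paddle.float64
print(x.long().dtype)    # paddle.int64

# byte() first casts floating (or complex) inputs to int8 and then
# to uint8, since a direct float -> uint8 cast is unsupported.
print(x.byte().dtype)    # paddle.uint8
```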
+ +from .tensor.compat import ( + split, +) + +__all__ = [ + 'split', +] diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index f50db1c25393bf..a0e8264dcf70df 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -43,8 +43,12 @@ from paddle._typing.device_like import PlaceLike from paddle.base.core import Place - _InitStreamBase = Union[core.CUDAStream, core.CustomDeviceStream] - _InitEventBase = Union[core.CUDAEvent, core.CustomDeviceEvent] + _InitStreamBase = Union[ + core.CUDAStream, core.CustomDeviceStream, core.XPUStream + ] + _InitEventBase = Union[ + core.CUDAEvent, core.CustomDeviceEvent, core.XPUEvent + ] from paddle import CUDAPlace, CustomPlace from paddle.base.libpaddle import _customDeviceProperties @@ -983,6 +987,11 @@ def __init__( self.event_base = core.CUDAEvent( enable_timing, blocking, interprocess ) + elif paddle.is_compiled_with_xpu() and isinstance( + self.device, paddle.XPUPlace + ): + self.event_base = core.XPUEvent() + elif isinstance(self.device, paddle.CustomPlace): self.event_base = core.CustomDeviceEvent( self.device.get_device_type(), @@ -1146,13 +1155,14 @@ def __init__( ) -> None: if stream_base is not None: if isinstance( - stream_base, (core.CUDAStream, core.CustomDeviceStream) + stream_base, + (core.CUDAStream, core.CustomDeviceStream, core.XPUStream), ): self.stream_base = stream_base self.device = stream_base.place else: raise TypeError( - "stream_base should be CUDAStream, CustomDeviceStream" + "stream_base should be CUDAStream, XPUStream, CustomDeviceStream" ) return @@ -1169,6 +1179,10 @@ def __init__( self.stream_base = core.CUDAStream( self.device.get_device_id(), priority ) + elif paddle.is_compiled_with_xpu() and isinstance( + self.device, paddle.XPUPlace + ): + self.stream_base = core.XPUStream(self.device.get_device_id()) elif isinstance(self.device, paddle.CustomPlace): self.stream_base = core.CustomDeviceStream( self.device.get_device_type(), @@ -1314,6 +1328,8 @@ def synchronize(self) -> None: def _as_parameter_(self): if isinstance(self.stream_base, core.CUDAStream): return ctypes.c_void_p(self.stream_base.cuda_stream) + elif isinstance(self.stream_base, core.XPUStream): + return ctypes.c_void_p(self.stream_base.xpu_stream) else: return ctypes.c_void_p(self.stream_base.raw_stream) @@ -1366,6 +1382,10 @@ def current_stream(device: PlaceLike | None = None) -> Stream: return Stream( stream_base=core._get_current_stream(place.get_device_id()) ) + elif paddle.is_compiled_with_xpu() and isinstance(place, paddle.XPUPlace): + return Stream( + stream_base=core._xpu_get_current_stream(place.get_device_id()) + ) elif isinstance(place, paddle.CustomPlace): return Stream( stream_base=core._get_current_custom_device_stream( @@ -1409,6 +1429,10 @@ def set_stream(stream: Stream) -> Stream: stream.stream_base.place, paddle.CUDAPlace ): core._set_current_stream(stream.stream_base) + elif paddle.is_compiled_with_xpu() and isinstance( + stream.stream_base.place, paddle.XPUPlace + ): + core._xpu_set_current_stream(stream.stream_base.idx) elif isinstance(stream.stream_base.place, paddle.CustomPlace): core._set_current_custom_device_stream( stream.stream_base.place.get_device_type(), diff --git a/python/paddle/device/xpu/__init__.py b/python/paddle/device/xpu/__init__.py index 9a48a70e4a7f23..3840c173953dcd 100644 --- a/python/paddle/device/xpu/__init__.py +++ b/python/paddle/device/xpu/__init__.py @@ -20,6 +20,8 @@ from paddle.base import core from paddle.utils import deprecated +from .streams 
import Event, Stream + if TYPE_CHECKING: from paddle import XPUPlace @@ -30,6 +32,8 @@ ] __all__ = [ + 'Stream', + 'Event', 'synchronize', 'device_count', 'set_debug_level', @@ -45,6 +49,45 @@ ] + +def current_stream(device: _XPUPlaceLike | None = None) -> core.XPUStream: + ''' + Return the current XPU stream for the given device. + + Args: + device(paddle.XPUPlace()|int|None, optional): The device, or the ID of the device, from which to get the stream. + If device is None, the current device is used. Default: None. + + Returns: + XPUStream: the current stream of the device. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> paddle.device.set_device('xpu') + + >>> s1 = paddle.device.xpu.current_stream() + + >>> s2 = paddle.device.xpu.current_stream(0) + + >>> s3 = paddle.device.xpu.current_stream(paddle.XPUPlace(0)) + + ''' + + device_id = -1 + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.XPUPlace): + device_id = device.get_device_id() + else: + raise ValueError("device type must be int or paddle.XPUPlace") + + return core._xpu_get_current_stream(device_id) + + def extract_xpu_device_id(device: _XPUPlaceLike, op_name: str) -> int: ''' Return the id of the given xpu device. It is just a utility that will not be exposed to users. diff --git a/python/paddle/device/xpu/streams.py b/python/paddle/device/xpu/streams.py new file mode 100644 index 00000000000000..b396c38890e59f --- /dev/null +++ b/python/paddle/device/xpu/streams.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.base.core import ( # noqa: F401 + XPUEvent as Event, + XPUStream as Stream, +) diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index c822bb83fafff9..e2e37bf83dd33c 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -59,6 +59,7 @@ class_center_sample, cosine_similarity, dropout, + dropout1d, dropout2d, dropout3d, feature_alpha_dropout, @@ -216,6 +217,7 @@ 'gumbel_softmax', 'sequence_mask', 'dropout', + 'dropout1d', 'dropout2d', 'dropout3d', 'alpha_dropout', diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 37cb95ab466089..14dd4a53642d54 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -15,6 +15,7 @@ from __future__ import annotations import math +import warnings from typing import TYPE_CHECKING, Literal import numpy @@ -1427,6 +1428,74 @@ def get_attrs(prog, dropout_prob, is_test, seed): return ret + +def dropout1d( + input: paddle.Tensor, + p: float = 0.5, + training: bool = True, + inplace: bool = False, +) -> paddle.Tensor: + """ + Randomly zero out entire 1D channels (feature maps) during training. + + Args: + input: Input tensor of shape [C, L] (2D) or [N, C, L] (3D) + p: Probability of a channel being zeroed.
Default: 0.5 + training: If False, returns input unchanged. Default: True + inplace: If True, modifies input tensor in-place. Default: False + WARNING: Currently not implemented (will behave as False). + TODO: Implement in-place operation in future versions. + + Returns: + Tensor with the same shape as input, where entire channels are zeroed with probability p + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # Case 1: 3D input (batched) + >>> x = paddle.randn([2, 3, 10]) # [N, C, L] + >>> y_train = paddle.nn.functional.dropout1d(x, p=0.2) # Training mode + >>> y_test = paddle.nn.functional.dropout1d(x, p=0.2, training=False) # Test mode + >>> print("Original first channel:", x[0, 0, :]) + >>> print("Train output (may be zeroed):", y_train[0, 0, :]) + >>> print("Test output (always unchanged):", y_test[0, 0, :]) + + >>> # Case 2: 2D input (single sample) + >>> x = paddle.randn([3, 8]) # [C, L] + >>> y = paddle.nn.functional.dropout1d(x, p=0.5) + >>> print("Input shape:", x.shape) + >>> print("Output shape:", y.shape) + >>> print("Zeroed channels count:", paddle.sum(y == 0).item()) + """ + if p < 0 or p > 1: + raise ValueError(f"dropout probability must be in [0, 1], got {p}") + + ndim = input.ndim + if ndim not in [2, 3]: + raise RuntimeError(f"dropout1d expects 2D or 3D input, got {ndim}D") + + if inplace: + warnings.warn( + "inplace=True is currently not supported in dropout1d and will be ignored. " + "This parameter is reserved for future implementation." + ) + # TODO: Implement actual in-place operation when supported by dropout + + need_squeeze = ndim == 2 + if need_squeeze: + input = input.unsqueeze(0) # [C, L] -> [1, C, L] + + # Apply dropout along the channel dimension + result = dropout(input, p=p, axis=1, training=training) + + if need_squeeze: + result = result.squeeze(0) # [1, C, L] -> [C, L] + + return result + + def dropout2d( x: Tensor, p: float = 0.5, diff --git a/python/paddle/pir/math_op_patch.py b/python/paddle/pir/math_op_patch.py index f9f8bfcf733616..4712433d948768 100644 --- a/python/paddle/pir/math_op_patch.py +++ b/python/paddle/pir/math_op_patch.py @@ -37,6 +37,35 @@ DataType.INT64, ] +_supported_dtype_conversions = { + # float + 'float16': 'float16', + 'half': 'float16', + 'bfloat16': 'bfloat16', + 'float32': 'float32', + 'float': 'float32', + 'float64': 'float64', + 'double': 'float64', + # int + 'int8': 'int8', + 'char': 'int8', + # We handle uint8 conversion separately + # 'uint8': 'uint8', + # 'byte': 'uint8', + 'int16': 'int16', + 'short': 'int16', + 'int32': 'int32', + 'int': 'int32', + 'int64': 'int64', + 'long': 'int64', + # other + 'bool': 'bool', + 'complex64': 'complex64', + 'complex128': 'complex128', + 'cfloat': 'complex64', + 'cdouble': 'complex128', +} + SUPPORT_PROMOTION_OPS = [ "__add__", "__radd__", @@ -370,6 +399,41 @@ def astype(self, dtype): return _C_ops.cast(self, dtype) + def byte(self): + # Paddle does not support casting float to uint8 directly, so convert to int8 first + if self.is_floating_point(): + tensor = astype(self, 'int8') + return astype(tensor, 'uint8') + elif self.is_complex(): + real = astype(self.real(), 'int8') + return astype(real, 'uint8') + else: + return astype(self, 'uint8') + + def _create_dtype_conversion_methods(): + """ + Create all data type conversion methods in one batch + """ + methods = [] + for method_name, target_dtype in _supported_dtype_conversions.items(): + + def make_conversion_method(dtype): + def conversion_method(self): + return astype(self, dtype) + + return conversion_method + +
method_impl = make_conversion_method(target_dtype) + method_impl.__name__ = method_name + method_impl.__doc__ = f""" + Cast a Value to {target_dtype} data type if it differs from the current dtype; + otherwise, return the original Value. + Returns: + Value: a new Value with {target_dtype} dtype + """ + methods.append((method_name, method_impl)) + return methods + def _scalar_add_(var, value): return paddle.scale(var, 1.0, value) @@ -1109,6 +1173,8 @@ def register_hook(self, hook): ('ndimension', ndimension), ('ndim', _ndim), ('astype', astype), + ('byte', byte), + ('uint8', byte), ('size', _size_), ('T', _T_), ('mT', _mT_), @@ -1253,6 +1319,8 @@ def register_hook(self, hook): ('__bool__', _bool_), ('__complex__', _complex_), ] + dtype_conversion_methods = _create_dtype_conversion_methods() + value_methods.extend(dtype_conversion_methods) global _already_patch_value if not _already_patch_value: diff --git a/python/paddle/static/quantization/__init__.py b/python/paddle/static/quantization/__init__.py index b04cf7fbb7a297..48b6f518ec67ee 100644 --- a/python/paddle/static/quantization/__init__.py +++ b/python/paddle/static/quantization/__init__.py @@ -19,9 +19,11 @@ ) from .quant2_int8_onednn_pass import ( # noqa: F401 Quant2Int8MkldnnPass, + Quant2Int8OnednnPass, ) from .quant_int8_onednn_pass import ( # noqa: F401 QuantInt8MkldnnPass, + QuantInt8OnednnPass, ) from .quanter import ( # noqa: F401 convert, diff --git a/python/paddle/static/quantization/quant2_int8_onednn_pass.py b/python/paddle/static/quantization/quant2_int8_onednn_pass.py index 72b505d44a6054..966bd511c8df08 100644 --- a/python/paddle/static/quantization/quant2_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant2_int8_onednn_pass.py @@ -14,13 +14,15 @@ import numpy as np +from paddle.utils import deprecated + from ...base.framework import IrGraph from ...framework import _get_paddle_place, core OpRole = core.op_proto_and_checker_maker.OpRole -class Quant2Int8MkldnnPass: +class Quant2Int8OnednnPass: """ Transform a quant model IrGraph into MKL-DNN supported INT8 IrGraph. 
The pass consists of the following transformations: @@ -429,7 +431,7 @@ def _optimize_fp32_graph(self, graph): graph = self._update_activations(graph) graph = self._remove_ctrl_vars(graph) graph = self._apply_pass( - graph, 'onednn_placement_pass', ['mkldnn_enabled_op_types'], [set()] + graph, 'onednn_placement_pass', ['onednn_enabled_op_types'], [set()] ) # remove dropout ops graph = self._apply_pass(graph, 'simplify_with_basic_ops_pass') @@ -721,3 +723,14 @@ def _quantize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'int8_scale_calculation_onednn_pass') graph = self._apply_pass(graph, 'params_quantization_onednn_pass') return graph + + +class Quant2Int8MkldnnPass(Quant2Int8OnednnPass): + @deprecated( + since="3.1.0", + update_to="paddle.static.quantization.Quant2Int8OnednnPass", + level=1, + reason="Quant2Int8MkldnnPass will be removed in future", + ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/python/paddle/static/quantization/quant_int8_onednn_pass.py b/python/paddle/static/quantization/quant_int8_onednn_pass.py index ad706837e0653e..2387e8bd9b70f7 100644 --- a/python/paddle/static/quantization/quant_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant_int8_onednn_pass.py @@ -14,11 +14,13 @@ import numpy as np +from paddle.utils import deprecated + from ...base.framework import IrGraph from ...framework import _get_paddle_place -class QuantInt8MkldnnPass: +class QuantInt8OnednnPass: """ Convert QuantizationFreezePass generated IrGraph to MKL-DNN supported INT8 IrGraph. Following transformations did in this pass: @@ -48,13 +50,13 @@ def __init__(self, _scope=None, _place=None): >>> # The original graph will be rewrite. >>> import paddle >>> from paddle import static - >>> from paddle.static.quantization import QuantInt8MkldnnPass + >>> from paddle.static.quantization import QuantInt8OnednnPass >>> from paddle.framework import IrGraph >>> from paddle.framework import core >>> graph = IrGraph(core.Graph(static.Program().desc), for_test=False) >>> place = paddle.CPUPlace() - >>> onednn_pass = QuantInt8MkldnnPass(static.global_scope(), place) + >>> onednn_pass = QuantInt8OnednnPass(static.global_scope(), place) >>> onednn_pass.apply(graph) """ @@ -245,7 +247,7 @@ def _transform_to_quantize_onednn(self, graph, op_node): quant_op_node = graph.create_op_node( op_type='quantize', attrs={ - 'data_format': 'MKLDNNLAYOUT', + 'data_format': 'ONEDNNLAYOUT', 'use_mkldnn': 1, 'Scale': scale_in, 'is_negative_input': 1, @@ -287,3 +289,14 @@ def _remove_unused_var_nodes(self, graph): ) ) graph.safe_remove_nodes(all_unused_vars) + + +class QuantInt8MkldnnPass(QuantInt8OnednnPass): + @deprecated( + since="3.1.0", + update_to="paddle.static.quantization.QuantInt8OnednnPass", + level=1, + reason="QuantInt8MkldnnPass will be removed in future", + ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 75d2882a04006f..32425a36ee145d 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -193,6 +193,7 @@ moveaxis, put_along_axis, put_along_axis_, + ravel, repeat_interleave, reshape, reshape_, @@ -459,6 +460,7 @@ kthvalue, masked_select, mode, + msort, nonzero, searchsorted, sort, @@ -726,6 +728,7 @@ 'index_select', 'nonzero', 'sort', + 'msort', 'index_sample', 'mean', 'std', diff --git a/python/paddle/tensor/compat.py b/python/paddle/tensor/compat.py new file mode 100644 index 
00000000000000..a6a755b7025203 --- /dev/null +++ b/python/paddle/tensor/compat.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import paddle +from paddle import _C_ops + +from ..base.framework import Variable +from ..framework import ( + in_dynamic_mode, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from paddle import Tensor + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + +__all__ = [] + + +@ForbidKeywordsDecorator( + illegal_keys=["x", "num_or_sections", "axis", "name"], + func_name="paddle.compat.split", + correct_name="paddle.split", +) +def split( + tensor: Tensor, split_size_or_sections: int | Sequence[int], dim: int = 0 +) -> tuple[Tensor, ...]: + """ + (PyTorch Compatible API) Split the input tensor into multiple sub-Tensors. + + Args: + tensor (Tensor): An N-D Tensor. The data type is bool, bfloat16, float16, float32, float64, uint8, int8, int32 or int64. + split_size_or_sections (int|list|tuple): + If split_size_or_sections is an integer type, then tensor will be split into equally sized chunks (if possible). + The last chunk will be smaller if the tensor size along the given dimension dim is not divisible by split_size. + If split_size_or_sections is a list, then tensor will be split into len(split_size_or_sections) chunks with sizes + in dim according to split_size_or_sections. Negative entries are not allowed. For example: for a dim with 9 channels, + [2, 3, -1] will not be interpreted as [2, 3, 4], but will be rejected and an exception will be thrown. + dim (int|Tensor, optional): The dim along which to split; it can be an integer or a ``0-D Tensor`` + with shape [] and data type ``int32`` or ``int64``. + If :math:`dim < 0`, the dim to split along is :math:`rank(x) + dim`. Default is 0. + Returns: + tuple(Tensor), The tuple of segmented Tensors. + + Note: + This is a PyTorch-compatible API that follows the signature and behavior of torch.split. + To use Paddle's original split, please use `paddle.split`. + + Examples: + ..
code-block:: python + + >>> import paddle + + >>> # x is a Tensor of shape [3, 8, 5] + >>> x = paddle.rand([3, 8, 5]) + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=1) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=[1, 2, 5], dim=1) + >>> print(out0.shape) + [3, 1, 5] + >>> print(out1.shape) + [3, 2, 5] + >>> print(out2.shape) + [3, 5, 5] + + >>> # dim is negative, the real dim is (rank(x) + dim)=1 + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=-2) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + """ + + def GetSplitSize(split_size, shape_on_dim): + remaining_num = shape_on_dim % split_size + num_complete_section = shape_on_dim // split_size + if remaining_num == 0: + return num_complete_section + else: + sections = [ + split_size for _ in range(num_complete_section) + ] + sections.append(remaining_num) + return sections + + def GetShapeOnDimInRange(shape, dim: int) -> int: + shape_range = len(shape) + if isinstance(dim, int): + if dim < -shape_range or dim >= shape_range: + raise ValueError( + f"(InvalidArgument) The dim is expected to be in the range [-{shape_range}, {shape_range}), but got {dim}" + ) + return shape[dim] + + if isinstance(split_size_or_sections, (list, tuple)): + for i, section_size in enumerate(split_size_or_sections): + if isinstance(section_size, Variable): + shape_val = int(section_size.item(0)) + else: + shape_val = section_size + if shape_val < 0: + raise ValueError( + f"paddle.compat.split expects split sizes to contain only non-negative entries, but got size = {shape_val} on dim {i}" + ) + + if in_dynamic_mode(): + if isinstance(dim, Variable): + dim = dim.item(0) + assert dim + len(tensor.shape) >= 0, "(rank(x) + dim) must be >= 0" + dim = (dim + len(tensor.shape)) if dim < 0 else dim + + if isinstance(split_size_or_sections, (list, tuple)): + if paddle.utils._contain_var(split_size_or_sections): + for index, item in enumerate(split_size_or_sections): + if isinstance(item, Variable): + split_size_or_sections[index] = split_size_or_sections[ + index + ].item() + elif not isinstance(split_size_or_sections, int): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size_or_sections)}." + ) + + if isinstance(split_size_or_sections, int): + # check whether the shape is divisible + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + + if isinstance(split_size_or_sections, list): + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + if isinstance(dim, paddle.pir.Value): + raise TypeError( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value cannot be used for indexing python lists/tuples."
+ ) + if isinstance(dim, int): + assert len(tensor.shape) + dim >= 0, "(rank(x) + dim) must be >= 0" + dim = (len(tensor.shape) + dim) if dim < 0 else dim + + input_shape = tensor.shape + + if not isinstance(split_size_or_sections, (int, list, tuple)): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in static graph mode." + ) + if isinstance(split_size_or_sections, int): + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + if isinstance(split_size_or_sections, list): + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert ( + len(split_size_or_sections) <= input_shape[dim] + ), 'len(split_size_or_sections) must not be more than input.shape[dim].' + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 6f03d03d47c1d3..2014603dff6ca6 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -24,6 +24,7 @@ import paddle from paddle import _C_ops from paddle.tensor import fill_constant +from paddle.utils.decorator_utils import ParamAliasDecorator from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only from ..base.data_feeder import ( @@ -57,6 +58,8 @@ TensorOrTensors, ) + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] @@ -1991,6 +1994,46 @@ def flatten( return out + +def ravel(input: Tensor) -> Tensor: + """ + Return a contiguous flattened tensor. A copy is made only if needed. + Note: + The output Tensor shares data with the input Tensor; no copy is made in ``dygraph`` mode. + If you want a copy, please use `Tensor.clone` like ``ravel_clone_x = x.ravel().clone()``. + For example: + + .. code-block:: text + + Case 1: + Given + X.shape = (3, 100, 100, 4) + + We get: + Out.shape = (3 * 100 * 100 * 4,) = (120000,) + Args: + input (Tensor): An N-D Tensor with data type float16, float32, + float64, int8, int32, int64 or uint8. + + Returns: + Tensor: A 1-D Tensor containing the same data as ``input``, with the same data type. + + Examples: + + ..
code-block:: python + + >>> import paddle + + >>> image_shape=(2, 3, 4, 4) + + >>> x = paddle.arange(end=image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]) + >>> img = paddle.reshape(x, image_shape) + + >>> out = paddle.ravel(img) + >>> print(out.shape) + [96] + """ + return flatten(input) + + @inplace_apis_in_dygraph_only def flatten_( x: Tensor, start_axis: int = 0, stop_axis: int = -1, name: str | None = None @@ -2682,6 +2725,11 @@ def row_stack(x: Sequence[Tensor], name: str | None = None) -> Tensor: return paddle.vstack(x, name=name) +@ForbidKeywordsDecorator( + illegal_keys=["tensor", "split_size_or_sections", "dim"], + func_name="paddle.split", + correct_name="paddle.compat.split", +) def split( x: Tensor, num_or_sections: int | Sequence[int], @@ -4762,6 +4810,7 @@ def expand_as(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: return out +@ParamAliasDecorator({"x": ["input"], "shape": ["size"]}) def broadcast_to( x: Tensor, shape: ShapeLike, name: str | None = None ) -> Tensor: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 3837d7595f8cc7..6b91b36f40fa3a 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -676,6 +676,44 @@ def sort( return out +def msort(input: Tensor) -> Tensor: + """ + + Sorts the input along the given axis = 0, and returns the sorted output tensor. The sort algorithm is ascending. + + Args: + input (Tensor): An input N-D Tensor with type float32, float64, int16, + int32, int64, uint8. + + Returns: + Tensor, sorted tensor(with the same shape and data type as ``input``). + + Examples: + + .. code-block:: python + + >>> import paddle + + >>> x = paddle.to_tensor([[[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]], + ... [[5,2,4,2], + ... [4,7,7,9], + ... [1,7,0,6]]], + ... dtype='float32') + >>> out1 = paddle.msort(input=x) + >>> print(out1.numpy()) + [[[5. 2. 4. 2.] + [0. 0. 1. 7.] + [1. 7. 0. 4.]] + [[5. 8. 9. 5.] + [4. 7. 7. 9.] + [6. 9. 2. 6.]]] + """ + + return sort(input, axis=0) + + def mode( x: Tensor, axis: int = -1, keepdim: bool = False, name: str | None = None ) -> tuple[Tensor, Tensor]: diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 061568cc0ce9a1..3fbcf6af86b4df 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -15,6 +15,7 @@ from ..base.framework import require_version from . import ( # noqa: F401 cpp_extension, + decorator_utils, dlpack, download, image_util, diff --git a/python/paddle/utils/decorator_utils.py b/python/paddle/utils/decorator_utils.py new file mode 100644 index 00000000000000..4eb62a32602fb0 --- /dev/null +++ b/python/paddle/utils/decorator_utils.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
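The two decorators wired up above change the call-site ergonomics of existing APIs: `ParamAliasDecorator` lets `broadcast_to` accept the PyTorch-style `input`/`size` keywords, while `ForbidKeywordsDecorator` makes `paddle.split` fail fast with a pointer to `paddle.compat.split` when torch-style keywords are passed. A minimal sketch of the intended behavior, assuming the decorators act as defined in `decorator_utils.py` below:

```python
import paddle

# ParamAliasDecorator: 'input' and 'size' are remapped to 'x' and 'shape'.
y1 = paddle.broadcast_to(paddle.rand([1, 5]), shape=[3, 5])
y2 = paddle.broadcast_to(input=paddle.rand([1, 5]), size=[3, 5])  # same call

# ForbidKeywordsDecorator: torch-style keywords raise a TypeError
# that suggests paddle.compat.split() instead.
try:
    paddle.split(paddle.rand([3, 8, 5]), split_size_or_sections=3, dim=1)
except TypeError as e:
    print(e)
```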
+ +from __future__ import annotations + +import functools +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + TypeVar, + cast, +) + +from typing_extensions import ParamSpec + +if TYPE_CHECKING: + from collections.abc import Iterable + + +_P = ParamSpec("_P") +_R = TypeVar("_R") +_DecoratedFunc = Callable[_P, _R] + + +class DecoratorBase(Generic[_P, _R]): + """Base class for decorators, providing a generic decoration framework. + + Subclasses only need to implement the 'process' method to define the core logic. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize decorator parameters""" + self.args = args + self.kwargs = kwargs + + def __call__(self, func: _DecoratedFunc[_P, _R]) -> _DecoratedFunc[_P, _R]: + """Apply the decorator to the target function""" + + @functools.wraps(func) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: + # Preprocess the arguments + processed_args, processed_kwargs = self.process(args, kwargs) + # Call the original function + return func(*processed_args, **processed_kwargs) + + # Keep the original signature + wrapper.__signature__ = inspect.signature(func) + return cast("_DecoratedFunc[_P, _R]", wrapper) + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + """Core processing method that subclasses must implement. + + Args: + args: positional arguments + kwargs: keyword arguments + + Returns: + The processed (args, kwargs) tuple + """ + raise NotImplementedError("Subclasses must implement this method") + + +# Parameter alias decorator +class ParamAliasDecorator(DecoratorBase[_P, _R]): + """Decorator that maps parameter aliases to their canonical names""" + + def __init__(self, alias_mapping: dict[str, Iterable[str]]) -> None: + super().__init__() + if not isinstance(alias_mapping, dict): + raise TypeError("alias_mapping must be a dictionary") + for k, v in alias_mapping.items(): + if not isinstance(v, (list, tuple, set)): + raise TypeError(f"Aliases for '{k}' must be iterable") + self.alias_mapping = alias_mapping + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + if not kwargs: + return args, kwargs + processed_kwargs = kwargs.copy() + for original, aliases in self.alias_mapping.items(): + for alias in aliases: + if alias in processed_kwargs: + if original not in processed_kwargs: + processed_kwargs[original] = processed_kwargs.pop(alias) + else: + raise ValueError( + f"Cannot specify both '{original}' and its alias '{alias}'" + ) + return args, processed_kwargs + + +class ForbidKeywordsDecorator(DecoratorBase[_P, _R]): + """A decorator that, when erroneous keyword arguments are detected, points users to the correct `compat` function""" + + def __init__( + self, illegal_keys: list[str] | str, func_name: str, correct_name: str + ) -> None: + super().__init__() + self.illegal_keys = ( + [illegal_keys] if isinstance(illegal_keys, str) else illegal_keys + ) + self.func_name = func_name + self.correct_name = correct_name + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + found_keys = [key for key in self.illegal_keys if key in kwargs] + + if found_keys: + keys_str = ", ".join(f"'{key}'" for key in found_keys) + plural = "s" if len(found_keys) > 1 else "" + + raise TypeError( + f"{self.func_name}() received unexpected keyword argument{plural} {keys_str}.
" + f"\nDid you mean to use {self.correct_name}() instead?" + ) + return args, kwargs diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index 80d639d052ba1d..67dd46b8b52335 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -156,6 +156,7 @@ if '${WITH_GPU}' == 'ON': if '${WITH_ROCM}' == 'ON': cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/hip/cinn_hip_runtime_source.h') + cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/hip/float16.h') if '${CINN_WITH_SYCL}' == 'ON': cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/sycl/cinn_sycl_runtime_source.h') diff --git a/test/cpp/inference/api/config_printer.h b/test/cpp/inference/api/config_printer.h index e1b1405a397208..d14a67828a5c1d 100644 --- a/test/cpp/inference/api/config_printer.h +++ b/test/cpp/inference/api/config_printer.h @@ -75,7 +75,7 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; os << GenSpaces(num_spaces) << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() + os << GenSpaces(num_spaces) << "use_onednn: " << config.onednn_enabled() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; diff --git a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc index e94453a9598855..38978395b5ac7c 100644 --- a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc @@ -335,7 +335,7 @@ TEST(AnalysisPredictor, bf16_pass_strategy) { passStrategy.EnableMkldnnBfloat16(); } -TEST(AnalysisPredictor, mkldnn_fc_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_pass_strategy) { std::vector passes; PassStrategy passStrategy(passes); passStrategy.DisableOnednnFcPasses(); @@ -343,7 +343,7 @@ TEST(AnalysisPredictor, mkldnn_fc_pass_strategy) { } #ifdef PADDLE_WITH_DNNL -TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_passes_cpu_pass_strategy) { CpuPassStrategy cpuPassStrategy; cpuPassStrategy.EnableONEDNN(); const std::vector fc_passes_to_erase( @@ -359,15 +359,15 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(AnalysisPredictor, mkldnn_fc_passes_gpu_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_passes_gpu_pass_strategy) { AnalysisConfig config; config.EnableUseGpu(100, 0); config.EnableONEDNN(); config.DisableOnednnFcPasses(); #ifdef PADDLE_WITH_DNNL - ASSERT_TRUE(config.mkldnn_fc_passes_disabled()); + ASSERT_TRUE(config.onednn_fc_passes_disabled()); #else - ASSERT_FALSE(config.mkldnn_fc_passes_disabled()); + ASSERT_FALSE(config.onednn_fc_passes_disabled()); #endif } #endif diff --git a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc index fc85ae1b10f7e2..ec10b780a35eeb 100644 --- a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc +++ b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc @@ -22,13 +22,13 @@ namespace inference { using paddle::PaddleTensor; -void profile(bool use_mkldnn = false, bool use_bfloat16 = false); +void profile(bool use_onednn = false, bool use_bfloat16 = false); std::vector> LoadInputData(); -void 
CompareNativeAndAnalysisWrapper(bool use_mkldnn = false); +void CompareNativeAndAnalysisWrapper(bool use_onednn = false); std::vector ParseInputStreamToVector( const std::string &line); -AnalysisConfig SetConfig(bool use_mkldnn = false, bool use_bfloat16 = false); +AnalysisConfig SetConfig(bool use_onednn = false, bool use_bfloat16 = false); template paddle::PaddleTensor ParseTensor(const std::string &field); @@ -50,15 +50,15 @@ TEST(Analyzer_bert, profile) { } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_bert, profile_mkldnn) { - auto use_mkldnn = true; - profile(use_mkldnn); +TEST(Analyzer_bert, profile_onednn) { + auto use_onednn = true; + profile(use_onednn); } -TEST(Analyzer_bert, profile_mkldnn_bf16) { - auto use_mkldnn = true; +TEST(Analyzer_bert, profile_onednn_bf16) { + auto use_onednn = true; auto use_bfloat16 = true; - profile(use_mkldnn, use_bfloat16); + profile(use_onednn, use_bfloat16); } #endif @@ -70,8 +70,8 @@ TEST(Analyzer_bert, compare) { } #ifdef PADDLE_WITH_DNNL TEST(Analyzer_bert, compare_mkldnn) { - auto use_mkldnn = true; - CompareNativeAndAnalysisWrapper(use_mkldnn); + auto use_onednn = true; + CompareNativeAndAnalysisWrapper(use_onednn); } #endif @@ -135,8 +135,8 @@ TEST(Analyzer_bert, transfer_scope_cache) { "The size of data cache is not equal to thread number.")); } -void profile(bool use_mkldnn, bool use_bfloat16) { - auto config(SetConfig(use_mkldnn, use_bfloat16)); +void profile(bool use_onednn, bool use_bfloat16) { + auto config(SetConfig(use_onednn, use_bfloat16)); std::vector> outputs; auto inputs = LoadInputData(); TestPrediction(reinterpret_cast(&config), @@ -168,8 +168,8 @@ std::vector> LoadInputData() { return inputs; } -void CompareNativeAndAnalysisWrapper(bool use_mkldnn) { - auto cfg(SetConfig(use_mkldnn)); +void CompareNativeAndAnalysisWrapper(bool use_onednn) { + auto cfg(SetConfig(use_onednn)); auto inputs = LoadInputData(); CompareNativeAndAnalysis( reinterpret_cast(&cfg), inputs); @@ -201,12 +201,12 @@ std::vector ParseInputStreamToVector( return tensors; } -AnalysisConfig SetConfig(bool use_mkldnn, bool use_bfloat16) { +AnalysisConfig SetConfig(bool use_onednn, bool use_bfloat16) { AnalysisConfig config; config.SetModel(FLAGS_infer_model); config.DisableFCPadding(); - if (use_mkldnn) { + if (use_onednn) { config.EnableONEDNN(); } diff --git a/test/deprecated/ir/inference/auto_scan_test.py b/test/deprecated/ir/inference/auto_scan_test.py index 041f84d50b804b..752b5f32d011ba 100755 --- a/test/deprecated/ir/inference/auto_scan_test.py +++ b/test/deprecated/ir/inference/auto_scan_test.py @@ -239,7 +239,7 @@ def create_inference_config( if use_gpu: config.enable_use_gpu(100, 0) if not use_mkldnn: - config.disable_mkldnn() + config.disable_onednn() if use_xpu: config.enable_xpu() if passes is not None: @@ -248,7 +248,7 @@ def create_inference_config( return config -class MkldnnAutoScanTest(AutoScanTest): +class OnednnAutoScanTest(AutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -336,14 +336,14 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_mkldnn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) -class PirMkldnnAutoScanTest(MkldnnAutoScanTest): +class PirOnednnAutoScanTest(OnednnAutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -572,8 +572,8 @@ 
def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_mkldnn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/deprecated/ir/inference/inference_pass_test.py b/test/deprecated/ir/inference/inference_pass_test.py index 958fd0d4571d29..739716382f50bd 100644 --- a/test/deprecated/ir/inference/inference_pass_test.py +++ b/test/deprecated/ir/inference/inference_pass_test.py @@ -38,7 +38,7 @@ def __init__(self, methodName='runTest'): self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False self.trt_parameters = None @@ -143,7 +143,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -178,9 +178,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() return config def check_output(self, atol=1e-3): @@ -285,23 +285,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/deprecated/ir/inference/quant_dequant_test.py b/test/deprecated/ir/inference/quant_dequant_test.py index 725f0948266dd3..69f2ddfaaa4fda 100644 --- a/test/deprecated/ir/inference/quant_dequant_test.py +++ b/test/deprecated/ir/inference/quant_dequant_test.py @@ -47,7 +47,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True self.trt_parameters = None @@ -204,7 +204,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -231,9 +231,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() return config def check_output_with_option( @@ -388,23 +388,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py b/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py index 7a8dc3b1a235f9..5f2f954479678a 100644 --- a/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py +++ b/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py @@ -98,14 +98,14 @@ def load(self, config_arg, inputs=None, outputs=None): if self.args.enable_mkldnn and not self.args.enable_gpu: config.disable_gpu() - config.enable_mkldnn() + config.enable_onednn() if self.args.precision == 'int8': - config.enable_mkldnn_int8( + config.enable_onednn_int8( {"conv2d", "depthwise_conv2d", "transpose2", "pool2d"} ) if not self.args.enable_mkldnn and not self.args.enable_gpu: config.disable_gpu() - # config.enable_mkldnn() + # config.enable_onednn() if self.args.enable_profile: config.enable_profile() shape_range_file = os.path.join( diff --git a/test/deprecated/legacy_test/test_attribute_var_deprecated.py b/test/deprecated/legacy_test/test_attribute_var_deprecated.py index 8f6e2b7091e3b7..0d041549188a20 100644 --- a/test/deprecated/legacy_test/test_attribute_var_deprecated.py +++ b/test/deprecated/legacy_test/test_attribute_var_deprecated.py @@ -51,7 +51,7 @@ def infer_prog(self): config = paddle_infer.Config( self.save_path + '.pdmodel', self.save_path + '.pdiparams' ) - config.disable_mkldnn() + config.disable_onednn() predictor = paddle_infer.create_predictor(config) input_names = predictor.get_input_names() for i, shape in enumerate(self.shapes): diff --git a/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py b/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py index b9e6379945aa76..f555bd7ff11ad7 100644 --- a/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py +++ b/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py @@ -54,7 +54,7 @@ def init_data(self): self.shape_x = [12, 10, 1] self.shape_y = [12, 1, 64] self.enable_mkldnn = True - self.enable_mkldnn_bfloat16 = True + self.enable_onednn_bfloat16 = True def test_check_output(self): use_gpu = False diff --git a/test/deprecated/quantization/CMakeLists.txt b/test/deprecated/quantization/CMakeLists.txt index 009c4df6cdd863..c5b4d9d3a67137 100644 --- a/test/deprecated/quantization/CMakeLists.txt +++ b/test/deprecated/quantization/CMakeLists.txt @@ -157,7 +157,7 @@ function(inference_quant2_int8_lstm_model_test target fp32_model quant_model ${dataset_path} --num_threads 1 - --mkldnn_cache_capacity + --onednn_cache_capacity 100 --warmup_iter 100 diff --git a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py index 393196a971766d..d7221b53ecbd50 100644 --- a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py @@ -19,12 +19,12 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import Quant2Int8MkldnnPass +from paddle.static.quantization import Quant2Int8OnednnPass paddle.enable_static() -class TestQuant2Int8MkldnnPassMul(unittest.TestCase): +class TestQuant2Int8OnednnPassMul(unittest.TestCase): def op_name(self): return "mul" @@ -80,7 +80,7 @@ def test_dequantize_op_weights(self): break assert op_node != "", f"op of type {self.op_name()} not found" - qpass = Quant2Int8MkldnnPass( + qpass = Quant2Int8OnednnPass( 
self.quantized_ops, _scope=self.scope, _place=self.place, @@ -125,12 +125,12 @@ def test_dequantize_op_weights(self): qpass._dequantize_op_weights(graph, op_node, "Y", "Out") -class TestQuant2Int8MkldnnPassMatmulV2(TestQuant2Int8MkldnnPassMul): +class TestQuant2Int8OnednnPassMatmulV2(TestQuant2Int8OnednnPassMul): def op_name(self): return "matmul_v2" -class TestQuant2Int8MkldnnPassConv2D(unittest.TestCase): +class TestQuant2Int8OnednnPassConv2D(unittest.TestCase): def setUp(self): self.scope = paddle.static.global_scope() self.place = paddle.CPUPlace() @@ -225,17 +225,17 @@ def test_quant_update_activation(self): graph = IrGraph(core.Graph(program.desc), for_test=True) graph = self.remove_fuse_activation_attribute(graph) self.check_graph_before_pass(graph) - quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + quant2_int8_onednn_pass = Quant2Int8OnednnPass( self.quantized_ops, _scope=self.scope, _place=self.place, _core=core, _debug=False, ) - graph = quant2_int8_mkldnn_pass._update_activations(graph) + graph = quant2_int8_onednn_pass._update_activations(graph) self.check_graph_after_pass(graph) - class TestQuant2Int8MkldnnPassNearestInterp(unittest.TestCase): + class TestQuant2Int8OnednnPassNearestInterp(unittest.TestCase): def op_name(self): return "nearest_interp" @@ -357,7 +357,7 @@ def test_quant_update_activation(self): with paddle.static.program_guard(program): self.prepare_program(program) graph = IrGraph(core.Graph(program.desc), for_test=True) - quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + quant2_int8_onednn_pass = Quant2Int8OnednnPass( self.quantized_ops, _scope=self.scope, _place=self.place, @@ -366,12 +366,12 @@ def test_quant_update_activation(self): ) input_scale_tensor = ( - quant2_int8_mkldnn_pass._convert_scale2tensor( + quant2_int8_onednn_pass._convert_scale2tensor( np.array(self.scale).astype(np.float64) ) ) output_scale_tensor = ( - quant2_int8_mkldnn_pass._convert_scale2tensor( + quant2_int8_onednn_pass._convert_scale2tensor( np.array(1.0 / self.scale * self.scale).astype( np.float64 ) @@ -383,12 +383,12 @@ def test_quant_update_activation(self): "conv_output": (False, output_scale_tensor), } if core.avx_supported(): - quant2_int8_mkldnn_pass._var_quant_scales = var_scale - graph = quant2_int8_mkldnn_pass._propagate_scales(graph) - graph = quant2_int8_mkldnn_pass._quantize_fp32_graph(graph) + quant2_int8_onednn_pass._var_quant_scales = var_scale + graph = quant2_int8_onednn_pass._propagate_scales(graph) + graph = quant2_int8_onednn_pass._quantize_fp32_graph(graph) self.check_graph_after_pass(graph) - class TestQuant2Int8MkldnnPassNearestInterpV2(unittest.TestCase): + class TestQuant2Int8OnednnPassNearestInterpV2(unittest.TestCase): def op_name(self): return "nearest_interp_v2" diff --git a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py index 2bdbed71f72e07..addd9aad1179b9 100644 --- a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py @@ -22,7 +22,7 @@ from paddle.base.framework import IrGraph from paddle.framework import core from paddle.static.quantization import ( - QuantInt8MkldnnPass, + QuantInt8OnednnPass, QuantizationFreezePass, QuantizationTransformPass, ) @@ -98,7 +98,7 @@ def build_program(self, main, startup, is_test, seed): opt.minimize(loss) return [img, label], loss - def mkldnn_based_freeze_graph( + def onednn_based_freeze_graph( self, use_cuda, seed, 
@@ -174,8 +174,8 @@ def mkldnn_based_freeze_graph( freeze_pass.apply(test_graph) # Transform quantized graph for MKL-DNN INT8 inference - mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place) - mkldnn_int8_pass.apply(test_graph) + onednn_int8_pass = QuantInt8OnednnPass(_scope=scope, _place=place) + onednn_int8_pass.apply(test_graph) dev_name = '_cpu_' if not for_ci: marked_nodes = set() @@ -191,7 +191,7 @@ def mkldnn_based_freeze_graph( + weight_quant_type, marked_nodes, ) - mkldnn_program = test_graph.to_program() + onednn_program = test_graph.to_program() # Check the transformation weights of conv2d and mul conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor()) @@ -202,7 +202,7 @@ def mkldnn_based_freeze_graph( # Check if the conv2d output and mul output are correctly linked to fake_dequantize's # output - self.check_program(mkldnn_program) + self.check_program(onednn_program) if not for_ci: print( '{}: {}'.format( @@ -215,16 +215,16 @@ def mkldnn_based_freeze_graph( ) ) - def test_mkldnn_graph_cpu_static(self): + def test_onednn_graph_cpu_static(self): with paddle.utils.unique_name.guard(): - self.mkldnn_based_freeze_graph( + self.onednn_based_freeze_graph( False, seed=2, activation_quant_type='range_abs_max', weight_quant_type='abs_max', for_ci=True, ) - self.mkldnn_based_freeze_graph( + self.onednn_based_freeze_graph( False, seed=2, activation_quant_type='moving_average_abs_max', diff --git a/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py b/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py index 5b147c409067fc..89702aa04b162c 100755 --- a/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py +++ b/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py @@ -139,7 +139,7 @@ def __init__(self, model_dir): # fast_tokenizer op only support cpu. 
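
The hunks above and below all follow one mechanical pattern: the legacy "mkldnn"-spelled methods on paddle_infer.Config are renamed to their "onednn" equivalents. A minimal sketch of the updated predictor setup, using only method names that appear in the added lines of this patch; the model file paths are placeholders:

    import paddle.inference as paddle_infer

    # placeholders for real model files
    config = paddle_infer.Config('model.pdmodel', 'model.pdiparams')
    config.disable_gpu()
    config.enable_onednn()            # was: config.enable_mkldnn()
    # optional low-precision paths (pick one), renamed the same way:
    config.enable_onednn_bfloat16()   # was: config.enable_mkldnn_bfloat16()
    # config.enable_onednn_int8({"conv2d", "depthwise_conv2d", "transpose2", "pool2d"})
    predictor = paddle_infer.create_predictor(config)
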
config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.set_cpu_math_library_num_threads(10) config.switch_use_feed_fetch_ops(False) diff --git a/test/ir/inference/auto_scan_test.py b/test/ir/inference/auto_scan_test.py index 0ceb053c50d5d9..5ae8ed1fb44ab1 100755 --- a/test/ir/inference/auto_scan_test.py +++ b/test/ir/inference/auto_scan_test.py @@ -252,6 +252,7 @@ def create_inference_config( passes: list[str] | None = None, use_gpu: bool = False, use_mkldnn: bool = False, + use_onednn: bool = False, use_xpu: bool = False, ir_optim: bool | None = None, ): @@ -263,8 +264,10 @@ def create_inference_config( config.switch_ir_optim(ir_optim) if use_gpu: config.enable_use_gpu(100, 0) - if not use_mkldnn: - config.disable_mkldnn() + if use_mkldnn: + use_onednn = True + if not use_onednn: + config.disable_onednn() if use_xpu: config.enable_xpu() if passes is not None: @@ -273,7 +276,7 @@ def create_inference_config( return config -class MkldnnAutoScanTest(AutoScanTest): +class OnednnAutoScanTest(AutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -370,14 +373,14 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) -class PirMkldnnAutoScanTest(MkldnnAutoScanTest): +class PirOnednnAutoScanTest(OnednnAutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -616,8 +619,8 @@ def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index 88c6debf574140..34bdfb4d2c16c5 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -38,7 +38,7 @@ def __init__(self, methodName='runTest'): self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False self.trt_parameters = None @@ -144,7 +144,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -179,9 +179,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() print('config summary:', config.summary()) return config @@ -287,23 +287,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. 
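
The create_inference_config helper in auto_scan_test.py keeps the old use_mkldnn keyword working by folding it into the new use_onednn flag, so oneDNN is only disabled when neither flag is set. A condensed sketch of that shim, mirroring the logic of the added lines:

    import paddle.inference as paddle_infer

    def create_inference_config(use_gpu=False, use_mkldnn=False, use_onednn=False):
        config = paddle_infer.Config()
        if use_gpu:
            config.enable_use_gpu(100, 0)
        if use_mkldnn:        # legacy callers may still pass use_mkldnn=True
            use_onednn = True
        if not use_onednn:    # oneDNN stays enabled unless explicitly disabled
            config.disable_onednn()
        return config
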
if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. ', diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index fff8f988178d26..f955273a88667f 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -47,7 +47,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True self.trt_parameters = None @@ -204,7 +204,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -231,9 +231,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() print('config summary:', config.summary()) return config @@ -389,23 +389,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/ir/inference/test_conv_act_onednn_fuse_pass.py b/test/ir/inference/test_conv_act_onednn_fuse_pass.py index 1106e672df270b..8392b19875abfa 100755 --- a/test/ir/inference/test_conv_act_onednn_fuse_pass.py +++ b/test/ir/inference/test_conv_act_onednn_fuse_pass.py @@ -21,7 +21,7 @@ class TestConvActOneDNNFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=False, use_mkldnn=True) + config = self.create_inference_config(use_gpu=False, use_onednn=True) yield config, ['fused_conv2d'], (1e-4, 1e-5) def is_program_valid(self, prog_config): diff --git a/test/ir/inference/test_conv_bn_fuse_pass.py b/test/ir/inference/test_conv_bn_fuse_pass.py index 2483012d47197a..9cfd09d53ca9e7 100644 --- a/test/ir/inference/test_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_bn_fuse_pass.py @@ -60,7 +60,7 @@ def sample_program_config(self, draw): st.integers(min_value=1, max_value=2), min_size=2, max_size=2 ) ) - use_mkldnn = draw(st.booleans()) + use_onednn = draw(st.booleans()) epsilon = draw(st.floats(min_value=0.0, max_value=0.001)) x_shape = ( @@ -108,7 +108,7 @@ def generate_bn_Var(): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_mkldnn, + use_mkldnn=use_onednn, has_bias=False, is_test=True, ) @@ -159,7 +159,7 @@ def generate_bn_Var(): def sample_predictor_configs(self, program_config): # for onednn if program_config.ops[0].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) else: config = self.create_inference_config() @@ -183,7 +183,7 @@ def add_ignore_pass_case(self): def teller1(program_config, predictor_config): if ( program_config.ops[0].attrs['data_format'] == "NHWC" - and not predictor_config.mkldnn_enabled() + and not predictor_config.onednn_enabled() ): return True return False diff --git a/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py index e7c6d6395606c8..ec013b5b89719a 100755 --- a/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py @@ -47,7 +47,7 @@ def sample_predictor_configs(self, program_config): # MKLDNN config = self.create_inference_config(use_gpu=False) - config.enable_mkldnn() + config.enable_onednn() yield config, ["conv2d", "elementwise_add"], (1e-4, 1e-5) # for gpu diff --git a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py index d623feffcf4aa0..31e9bc98973814 100644 --- a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py @@ -195,7 +195,7 @@ def generate_batch_norm_Variance(): def sample_predictor_configs(self, program_config): # for onednn if program_config.ops[0].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose_bias'], (1e-5, 1e-5) # for cpu else: diff --git a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py index ca6506c8938936..50b19a7ffba3a4 100644 --- a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py @@ -221,7 +221,7 @@ def generate_batch_norm_Variance(): def sample_predictor_configs(self, 
program_config): # for onednn if program_config.ops[2].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose', 'elementwise_add'], (1e-5, 1e-5) # cpu else: diff --git a/test/ir/inference/test_matmul_scale_fuse_pass.py b/test/ir/inference/test_matmul_scale_fuse_pass.py index 67728e12a30250..92820db32fc182 100644 --- a/test/ir/inference/test_matmul_scale_fuse_pass.py +++ b/test/ir/inference/test_matmul_scale_fuse_pass.py @@ -36,7 +36,7 @@ def sample_predictor_configs(self, program_config): ], (1e-5, 1e-5) # onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, [ "matmul", ], (1e-5, 1e-5) diff --git a/test/ir/inference/test_matmul_v2_scale_fuse_pass.py b/test/ir/inference/test_matmul_v2_scale_fuse_pass.py index 65a456f3a0a841..4eafcbb3d8b16e 100644 --- a/test/ir/inference/test_matmul_v2_scale_fuse_pass.py +++ b/test/ir/inference/test_matmul_v2_scale_fuse_pass.py @@ -36,7 +36,7 @@ def sample_predictor_configs(self, program_config): # yield config, ["matmul_v2", ], (1e-5, 1e-5) # onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, [ "matmul_v2", ], (1e-5, 1e-5) diff --git a/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py b/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py index 98be6e451d08c7..91885e03032987 100644 --- a/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py @@ -116,7 +116,7 @@ def generate_weight2(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["conv3d"], (1e-5, 1e-5) # TODO(baoachun) diff --git a/test/ir/inference/test_mkldnn_conv3d_op.py b/test/ir/inference/test_mkldnn_conv3d_op.py index 7a258626db7e8b..e6593042d8f55f 100644 --- a/test/ir/inference/test_mkldnn_conv3d_op.py +++ b/test/ir/inference/test_mkldnn_conv3d_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest, PirMkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest, PirOnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnConv3dOp(MkldnnAutoScanTest): +class TestMkldnnConv3dOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -75,7 +75,7 @@ def generate_weight(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( @@ -91,7 +91,7 @@ def test(self, *args, **kwargs): self.run_test(*args, **kwargs) -class TestPirOneDNNPad3DOp(PirMkldnnAutoScanTest): +class TestPirOneDNNPad3DOp(PirOnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -145,7 +145,7 @@ def generate_weight(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py 
b/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py index fd60d7b65193a9..c277e19b3d4f20 100644 --- a/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -130,7 +130,7 @@ def generate_scale_bias(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) def add_ignore_pass_case(self): @@ -144,7 +144,7 @@ def teller1(program_config, predictor_config): # onednn Output has diff with bias! def teller2(program_config, predictor_config): return ( - predictor_config.mkldnn_enabled() + predictor_config.onednn_enabled() and program_config.ops[0].attrs['has_bias'] ) diff --git a/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 196e1f5909fe9c..15ad02a8fb3783 100644 --- a/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -95,7 +95,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py index c13888adf1a95a..1381df923ed843 100644 --- a/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py @@ -92,7 +92,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py index f0e1d8c74179b2..cf9355a9ac8d05 100644 --- a/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py @@ -97,7 +97,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py index cc0ccb809a1f14..1ef842da9d0cf8 100644 --- a/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py @@ -96,7 +96,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py index 9e3c9efd9b2a2c..d6b4f70ff27a96 100644 --- a/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py @@ -105,7 +105,7 @@ def 
generate_weight2(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose_bias'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_depthwise_conv_pass.py b/test/ir/inference/test_mkldnn_depthwise_conv_pass.py index b2b02a52014ae7..108ea3385d823b 100644 --- a/test/ir/inference/test_mkldnn_depthwise_conv_pass.py +++ b/test/ir/inference/test_mkldnn_depthwise_conv_pass.py @@ -122,7 +122,7 @@ def generate_conv2d_Filter(): def sample_predictor_configs(self, program_config): # for onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d'], (1e-5, 1e-5) def is_program_valid(self, program_config: ProgramConfig) -> bool: diff --git a/test/ir/inference/test_mkldnn_log_softmax_op.py b/test/ir/inference/test_mkldnn_log_softmax_op.py index e9c028515b0001..be911541394042 100644 --- a/test/ir/inference/test_mkldnn_log_softmax_op.py +++ b/test/ir/inference/test_mkldnn_log_softmax_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMKLDNNLogSoftmaxOp(MkldnnAutoScanTest): +class TestMKLDNNLogSoftmaxOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -51,7 +51,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py index 16d7ae7baf5164..d6be1efaa34353 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -86,7 +86,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index ef4865b4d782a8..45c697117e0c90 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -129,7 +129,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_matmulv2_op.py b/test/ir/inference/test_mkldnn_matmulv2_op.py index 9a72e806b32268..2c5698d6567584 100644 --- a/test/ir/inference/test_mkldnn_matmulv2_op.py +++ b/test/ir/inference/test_mkldnn_matmulv2_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from 
auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnMatmulv2Op(MkldnnAutoScanTest): +class TestMkldnnMatmulv2Op(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: if len(program_config.inputs["input_data2"].shape) == 4: if ( @@ -113,7 +113,7 @@ def generate_input(type, *args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_mish_op.py b/test/ir/inference/test_mkldnn_mish_op.py index c3e4bccf6ec68c..abf580836237a5 100644 --- a/test/ir/inference/test_mkldnn_mish_op.py +++ b/test/ir/inference/test_mkldnn_mish_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnMishOp(MkldnnAutoScanTest): +class TestMkldnnMishOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: # if mode is channel, and in_shape is 1 rank if ( @@ -60,7 +60,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_pad3d_op.py b/test/ir/inference/test_mkldnn_pad3d_op.py index f8bd247dfa64d9..eb411b82118ec0 100644 --- a/test/ir/inference/test_mkldnn_pad3d_op.py +++ b/test/ir/inference/test_mkldnn_pad3d_op.py @@ -17,7 +17,7 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest, PirMkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest, PirOnednnAutoScanTest from hypothesis import given from program_config import ( OpConfig, @@ -26,7 +26,7 @@ ) -class TestOneDNNPad3DOp(MkldnnAutoScanTest): +class TestOneDNNPad3DOp(OnednnAutoScanTest): def sample_program_configs(self, *args, **kwargs): def generate_input(*args, **kwargs): return np.random.random(kwargs['in_shape']).astype(np.float32) @@ -60,7 +60,7 @@ def generate_paddings(): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( @@ -82,7 +82,7 @@ def test(self, *args, **kwargs): self.run_test(quant=False, *args, **kwargs) -class TestPirOneDNNPad3DOp(PirMkldnnAutoScanTest): +class TestPirOneDNNPad3DOp(PirOnednnAutoScanTest): def sample_program_configs(self, *args, **kwargs): def generate_input(*args, **kwargs): return np.random.random(kwargs['in_shape']).astype(np.float32) @@ -117,7 +117,7 @@ def generate_paddings(): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_prelu_op.py b/test/ir/inference/test_mkldnn_prelu_op.py index cab24fb22178da..c6f8b5b6ac2653 100644 --- a/test/ir/inference/test_mkldnn_prelu_op.py +++ b/test/ir/inference/test_mkldnn_prelu_op.py @@ -17,12 +17,12 @@ 
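
The same base-class rename repeats across these files: MkldnnAutoScanTest becomes OnednnAutoScanTest and PirMkldnnAutoScanTest becomes PirOnednnAutoScanTest. Out-of-tree tests that still import the old names would fail at import time; if a deprecation window were wanted, simple aliases could be left behind in auto_scan_test.py. This is a hypothetical sketch, not part of this patch:

    # hypothetical compatibility aliases (not in this patch):
    MkldnnAutoScanTest = OnednnAutoScanTest
    PirMkldnnAutoScanTest = PirOnednnAutoScanTest
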
import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnPreluOp(MkldnnAutoScanTest): +class TestMkldnnPreluOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: # if mode is channel, and in_shape is 1 rank if ( @@ -85,7 +85,7 @@ def generate_alpha(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) def add_skip_pass_case(self): diff --git a/test/ir/inference/test_mkldnn_shape_op.py b/test/ir/inference/test_mkldnn_shape_op.py index 69e18e08d32a5d..31603b81d4d49a 100644 --- a/test/ir/inference/test_mkldnn_shape_op.py +++ b/test/ir/inference/test_mkldnn_shape_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnShapeOp(MkldnnAutoScanTest): +class TestMkldnnShapeOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -52,7 +52,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py b/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py index 8ebcfaf1041b6b..1a9ae3d8f64177 100644 --- a/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py +++ b/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py @@ -130,7 +130,7 @@ def generate_input(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["shuffle_channel"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_shuffle_channel_op.py b/test/ir/inference/test_mkldnn_shuffle_channel_op.py index 64843f08156c65..d5b61dcc962ce3 100644 --- a/test/ir/inference/test_mkldnn_shuffle_channel_op.py +++ b/test/ir/inference/test_mkldnn_shuffle_channel_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMKLDNNShuffleChannelOp(MkldnnAutoScanTest): +class TestMKLDNNShuffleChannelOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -51,7 +51,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py b/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py index 84fefa24230fcd..5c7c091ca4f445 100644 --- a/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py 
+++ b/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py @@ -108,7 +108,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['batch_norm'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py index 565b4f92446cac..ac82c4997da3af 100644 --- a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py @@ -21,7 +21,7 @@ class TestConvBiasOneDNNFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=False, use_mkldnn=True) + config = self.create_inference_config(use_gpu=False, use_onednn=True) yield config, ['fused_conv2d'], (1e-4, 1e-5) def is_program_valid(self, prog_config): diff --git a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py index 21c154615cdee3..da95b32fcda80b 100644 --- a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py @@ -136,7 +136,7 @@ def generate_data(shape): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py b/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py index f45190a5084f24..06b383f8aa2716 100644 --- a/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py @@ -156,7 +156,7 @@ def generate_data(input_type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d', 'fused_conv2d', 'concat'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py b/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py index fe51b2d0e38924..acce128f2fd3e9 100644 --- a/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py @@ -116,7 +116,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['relu', 'conv2d', 'fused_conv2d'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py b/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py index 99c731a1d9dfb7..a7861b1ef7a7e1 100644 --- a/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py +++ b/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py @@ -109,7 +109,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git 
a/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py b/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py index c60a3071010126..70337fc48b9963 100644 --- a/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py +++ b/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py @@ -145,7 +145,7 @@ def generate_input2(attrs): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index 3902f8c6b98a75..e6eca9654f330e 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -389,29 +389,38 @@ def convert_uint16_to_float(in_list): return np.reshape(out, in_list.shape) -def get_places(string_format=False): +def get_places(): places = [] - if not string_format: - if ( - os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() - in ['1', 'true', 'on'] - or not core.is_compiled_with_cuda() - ): - places.append(base.CPUPlace()) - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - else: - if ( - os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() - in ['1', 'true', 'on'] - or not paddle.is_compiled_with_cuda() - ): - places.append('cpu') - if paddle.is_compiled_with_cuda(): - places.append('gpu') + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not core.is_compiled_with_cuda() + ): + places.append(base.CPUPlace()) + if core.is_compiled_with_cuda(): + places.append(base.CUDAPlace(0)) + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + places.append(base.CustomPlace(dev_type, 0)) return places +def get_devices(): + devices = [] + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not paddle.is_compiled_with_cuda() + ): + devices.append('cpu') + if paddle.is_compiled_with_cuda(): + devices.append('gpu') + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + devices.append(f'{dev_type}:0') + return devices + + def get_device_place(): if core.is_compiled_with_cuda(): return base.CUDAPlace(0) @@ -423,6 +432,15 @@ def get_device_place(): return base.CPUPlace() +def is_custom_device(): + custom_dev_types = paddle.device.get_all_custom_device_type() + if custom_dev_types and paddle.device.is_compiled_with_custom_device( + custom_dev_types[0] + ): + return True + return False + + @contextmanager def auto_parallel_test_guard(test_info_path, generated_test_file_path): test_info_file, generated_test_file = None, None @@ -2902,6 +2920,13 @@ def _get_places(self): return [place] else: return [] + elif is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + place = core.CustomPlace(dev_type, 0) + if core.is_float16_supported(place): + return [place] + else: + return [] else: return [] places = [] @@ -2931,6 +2956,9 @@ def _get_places(self): and not cpu_only ): places.append(core.CUDAPlace(0)) + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + places.append(core.CustomPlace(dev_type, 0)) return places def check_output( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index a03b55c29008ea..a40ce6f718094d 100644 --- a/test/legacy_test/test_activation_op.py +++ 
b/test/legacy_test/test_activation_op.py @@ -23,6 +23,7 @@ convert_float_to_uint16, get_device_place, get_places, + is_custom_device, ) from scipy.special import erf, expit from utils import static_guard @@ -497,7 +498,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSigmoidBF16(OpTest): @@ -1765,7 +1767,8 @@ def init_dtype(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSqrtBF16(OpTest): @@ -2037,7 +2040,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} self.convert_input_output() - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): self.__class__.no_need_check_grad = True def init_shape(self): @@ -2091,7 +2094,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} self.convert_input_output() - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): self.__class__.no_need_check_grad = True def init_shape(self): @@ -4563,7 +4566,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSquareBF16(OpTest): @@ -4917,7 +4921,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSoftplusBF16(OpTest): @@ -5595,7 +5600,8 @@ def test_errors(self): # ------------------ Test Cudnn Activation---------------------- def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestActCudnn(parent): def init_kernel_type(self): diff --git a/test/legacy_test/test_adadelta_op.py b/test/legacy_test/test_adadelta_op.py index 1650f246c25755..9dfa5d3e6380e1 100644 --- a/test/legacy_test/test_adadelta_op.py +++ b/test/legacy_test/test_adadelta_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices import paddle from paddle import base @@ -294,7 +294,7 @@ def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_adadelta_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_adagrad_op.py b/test/legacy_test/test_adagrad_op.py index 0b5d1fef458200..c5497d51f25bd7 100644 --- a/test/legacy_test/test_adagrad_op.py +++ b/test/legacy_test/test_adagrad_op.py @@ -17,7 +17,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, 
get_devices, get_places import paddle from paddle.base import core @@ -242,7 +242,7 @@ def _test_adagrad_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_adagrad_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_adam_op.py b/test/legacy_test/test_adam_op.py index c2ebbea1653ad3..4875c0dda23c83 100644 --- a/test/legacy_test/test_adam_op.py +++ b/test/legacy_test/test_adam_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle from paddle import base @@ -1296,7 +1296,7 @@ def _adam_optimize_static( return out def _get_places(self): - return get_places(string_format=True) + return get_devices() def _check_with_place_amp(self, place, use_amp): # test dygraph mode diff --git a/test/legacy_test/test_adamax_op.py b/test/legacy_test/test_adamax_op.py index 8b3532794d0f28..5670e4b2751b71 100644 --- a/test/legacy_test/test_adamax_op.py +++ b/test/legacy_test/test_adamax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices import paddle @@ -275,7 +275,7 @@ def _test_adamax_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def _get_places(self): - return get_places(string_format=True) + return get_devices() def test_main(self): for place in self._get_places(): diff --git a/test/legacy_test/test_adamw_op.py b/test/legacy_test/test_adamw_op.py index 904d87815427ec..1523468a75460d 100644 --- a/test/legacy_test/test_adamw_op.py +++ b/test/legacy_test/test_adamw_op.py @@ -18,7 +18,7 @@ from functools import partial import numpy as np -from op_test import OpTest, get_places +from op_test import OpTest, get_devices import paddle from paddle import base, nn @@ -758,7 +758,7 @@ def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def _get_places(self): - places = get_places(string_format=True) + places = get_devices() if paddle.is_compiled_with_xpu(): places.append('xpu') return places diff --git a/test/legacy_test/test_adaptive_log_softmax_with_loss.py b/test/legacy_test/test_adaptive_log_softmax_with_loss.py index 6210e1d469bda5..29728b8b25476d 100644 --- a/test/legacy_test/test_adaptive_log_softmax_with_loss.py +++ b/test/legacy_test/test_adaptive_log_softmax_with_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle import paddle.optimizer as optim @@ -58,7 +58,7 @@ def predict(self, input): class TestNNAdaptiveLogSoftmaxWithLossAPI(unittest.TestCase): def setUp(self): paddle.seed(2024) - self.place = get_places(string_format=True) + self.place = get_devices() self.log_np = np.random.randn(4, 8).astype('float32') self.predict_np = np.abs(np.random.randn(64, 8).astype('float32')) diff --git a/test/legacy_test/test_attribute_var.py b/test/legacy_test/test_attribute_var.py index 9ed3bffc0d9dad..9da566783ef4db 100644 --- a/test/legacy_test/test_attribute_var.py +++ b/test/legacy_test/test_attribute_var.py @@ -51,7 +51,7 @@ def infer_prog(self): config = paddle_infer.Config( self.save_path + '.pdmodel', self.save_path + '.pdiparams' ) - config.disable_mkldnn() + config.disable_onednn() predictor = 
paddle_infer.create_predictor(config) input_names = predictor.get_input_names() for i, shape in enumerate(self.shapes): diff --git a/test/legacy_test/test_blha_get_max_len_op.py b/test/legacy_test/test_blha_get_max_len_op.py index ab8b410a8c15ab..790e654dd4f1f6 100644 --- a/test/legacy_test/test_blha_get_max_len_op.py +++ b/test/legacy_test/test_blha_get_max_len_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle.base import core @@ -109,7 +110,8 @@ def test_static_api(self): @unittest.skipIf( - not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu(), "Only support XPU or GPU in CUDA mode.", ) class TestBlhaGetMaxLenOp_ZeroSize(unittest.TestCase): diff --git a/test/legacy_test/test_cartesian_prod.py b/test/legacy_test/test_cartesian_prod.py index 7246df017f8f7d..f7d0548a76527b 100644 --- a/test/legacy_test/test_cartesian_prod.py +++ b/test/legacy_test/test_cartesian_prod.py @@ -16,7 +16,7 @@ from itertools import product import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import core @@ -36,7 +36,7 @@ def setUp(self): self.c_np = np.random.random(self.c_shape).astype(self.dtype_np) self.d_np = np.empty(0, self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float32' @@ -119,7 +119,7 @@ def setUp(self): self.a_np = np.random.random(self.a_shape).astype(self.dtype_np) self.b_np = np.empty(0, self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float32' diff --git a/test/legacy_test/test_cauchy_inplace.py b/test/legacy_test/test_cauchy_inplace.py index 4aa41ce0ca130e..ebe03c9acf8f57 100644 --- a/test/legacy_test/test_cauchy_inplace.py +++ b/test/legacy_test/test_cauchy_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -35,7 +35,7 @@ def test_fp64(): tensor_fp64.cauchy_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -92,7 +92,7 @@ def test_cauchy_inplace_distribution(self): class TestCauchyInplaceEmptyTensor(unittest.TestCase): def test_cauchy_inplace_op_empty_tensor(self): test_shapes = [(200, 1), (1, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -118,7 +118,7 @@ def test_grad(): cauchy_grad = tensor_b.grad.numpy() self.assertTrue((cauchy_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_class_center_sample_op.py b/test/legacy_test/test_class_center_sample_op.py index ad8a19acc15770..8302df224bb2de 100644 --- a/test/legacy_test/test_class_center_sample_op.py +++ b/test/legacy_test/test_class_center_sample_op.py @@ -15,10 +15,9 @@ import unittest import numpy as np -from op_test import OpTest, paddle_static_guard +from op_test import OpTest, get_places, paddle_static_guard import paddle -from paddle.base import core def class_center_sample_numpy(label, classes_list, num_samples): @@ -135,9 +134,7 @@ def setUp(self): 
self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(2021) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 10 @@ -235,9 +232,7 @@ class TestClassCenterSampleAPIError(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 20 @@ -275,9 +270,7 @@ class TestClassCenterSampleAPIError1(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 5 diff --git a/test/legacy_test/test_combinations.py b/test/legacy_test/test_combinations.py index 1390fa90265895..f2f0e49fdd2748 100644 --- a/test/legacy_test/test_combinations.py +++ b/test/legacy_test/test_combinations.py @@ -16,7 +16,7 @@ from itertools import combinations, combinations_with_replacement import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import Program @@ -47,7 +47,7 @@ def setUp(self): self.modify_setting() self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float64' @@ -120,7 +120,7 @@ def modify_setting(self): class TestCombinationsEmpty(unittest.TestCase): def setUp(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_dygraph(self): paddle.disable_static() diff --git a/test/legacy_test/test_compat_split.py b/test/legacy_test/test_compat_split.py new file mode 100644 index 00000000000000..8410e10e1e1caf --- /dev/null +++ b/test/legacy_test/test_compat_split.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
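
This new file tests paddle.compat.split, whose integer form follows the torch convention encoded in _compare_with_origin: the int is the size of each chunk along dim, with a smaller trailing chunk when the axis length is not evenly divisible, while paddle.split reads an int as the number of equal sections. A small illustration of that difference:

    import paddle
    from paddle.compat import split

    x = paddle.arange(10)
    parts = split(x, 4)           # chunk size 4 -> shapes [4], [4], [2]
    halves = paddle.split(x, num_or_sections=2, axis=0)  # 2 equal sections -> [5], [5]
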
+ +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplit(unittest.TestCase): + def _compare_with_origin(self, input_tensor, size, axis=0): + pd_results = split(input_tensor, size, dim=axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + + self.assertEqual(len(origin_results), len(pd_results)) + + # check shape and output section size of the output + for origin_ts, pd_ts in zip(origin_results, pd_results): + np.testing.assert_allclose(origin_ts.numpy(), pd_ts.numpy()) + + def test_basic_split(self): + """Test basic splitting with integer size""" + data = paddle.arange(12).reshape([3, 4]).astype('float32') + self._compare_with_origin(data, 1, 0) + self._compare_with_origin(data, 2, 1) + + def test_split_with_list_sections(self): + """Test splitting with list of section sizes""" + data = paddle.rand([10, 5]) + self._compare_with_origin(data, [3, 2, 5], 0) + self._compare_with_origin(data, [1, 4], -1) + + def test_chained_operations(self): + """Test split with complex operation chain""" + x = paddle.rand([8, 12]) + y = paddle.sin(x) * 2.0 + paddle.exp(x) / 3.0 + z = paddle.nn.functional.relu(y) + + z1, z2 = split(z, 7, dim=1) + + self.assertEqual(z1.shape, [8, 7]) + self.assertEqual(z2.shape, [8, 5]) + + z_np = z.numpy() + np.testing.assert_allclose(z_np[:, :7], z1.numpy()) + np.testing.assert_allclose(z_np[:, 7:], z2.numpy()) + + def test_split_grad(self): + """Test backprop for split, in1 and in2 are computed by + compat.split and original split""" + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + def computation_graph(in_tensor): + y = in_tensor * 2.3 + 3.0 + y = paddle.maximum(y, paddle.to_tensor([0], dtype=paddle.float32)) + return y.mean(axis=0) + + out1 = computation_graph(in1) + out2 = computation_graph(in2) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + res1.backward() + res2.backward() + np.testing.assert_allclose(in1.grad.numpy(), in2.grad.numpy()) + + def test_empty_dim(self): + """Split with empty dim""" + in_tensor = paddle.arange(72, dtype=paddle.int64).reshape([3, 12, 2]) + self._compare_with_origin(in_tensor, [5, 0, 7], axis=1) + + def test_split_with_one_block(self): + """Resulting tuple should be of length 1""" + in_tensor = paddle.arange(60, dtype=paddle.float32).reshape([3, 4, 5]) + self._compare_with_origin(in_tensor, 5, paddle.to_tensor([-1])) + self._compare_with_origin(in_tensor, [5], paddle.to_tensor(2)) + + def test_edge_cases(self): + """Test edge cases and error handling""" + x = paddle.arange(5) + s1, s2 = split(x, [3, 2]) + np.testing.assert_allclose(s1.numpy(), [0, 1, 2]) + np.testing.assert_allclose(s2.numpy(), [3, 4]) + + x = paddle.rand([2, 2, 2]) + a, b = split(x, 1, 2) + self.assertEqual(a.shape, [2, 2, 1]) + + # invalid split sections + with self.assertRaises(ValueError): + split(x, [3, 1], 1) + + # invalid split axis + with 
self.assertRaises(ValueError): + split(x, 2, 3) + + def test_error_hint(self): + """Test whether there will be correct exception when users pass paddle.split kwargs in paddle.compat.split, vice versa.""" + x = paddle.randn([3, 9, 5]) + + msg_gt_1 = ( + "paddle.split() received unexpected keyword arguments 'tensor', 'split_size_or_sections', 'dim'. " + "\nDid you mean to use paddle.compat.split() instead?" + ) + msg_gt_2 = ( + "paddle.compat.split() received unexpected keyword argument 'num_or_sections'. " + "\nDid you mean to use paddle.split() instead?" + ) + msg_gt_3 = "(InvalidArgument) The dim is expected to be in range of [-3, 3), but got 3" + msg_gt_4 = "paddle.compat.split expects split_sizes have only non-negative entries, but got size = -5 on dim 2" + + split_size = paddle.to_tensor([3]) + msg_gt_5 = ( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size)}." + ) + + with self.assertRaises(TypeError) as cm: + tensors = paddle.split(tensor=x, split_size_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, num_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, 3, dim=3) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, [3, 3, -5], -2) + self.assertEqual(str(cm.exception), msg_gt_4) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, split_size, 1) + self.assertEqual(str(cm.exception), msg_gt_5) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split_static.py b/test/legacy_test/test_compat_split_static.py new file mode 100644 index 00000000000000..f685121aabd750 --- /dev/null +++ b/test/legacy_test/test_compat_split_static.py @@ -0,0 +1,221 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
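
The static-graph companion file below repeats one scaffold throughout: build the program under program_guard, call split inside it, then execute with a static Executor. Condensed to its essentials, with an illustrative feed shape:

    import numpy as np
    import paddle
    from paddle.compat import split

    paddle.enable_static()
    with paddle.static.program_guard(paddle.static.Program()):
        x = paddle.static.data(name='x', shape=[None, 6], dtype='float32')
        a, b = split(x, split_size_or_sections=[3, 3], dim=1)
        exe = paddle.static.Executor(paddle.CPUPlace())
        out = exe.run(feed={'x': np.random.rand(2, 6).astype('float32')},
                      fetch_list=[a, b])
    paddle.disable_static()
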
+ +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplitStatic(unittest.TestCase): + def _compare_with_origin_static( + self, input_shape, size, axis=0, dim_rank=-1 + ): + """size_dim: -1 means we input size by int, 0 means 0-size tensor, 1 means tensor with shape [1]""" + numel = 1 + for v in input_shape: + numel *= v + input_axis = axis + if dim_rank == 0: + input_axis = paddle.to_tensor(axis) + elif dim_rank == 1: + input_axis = paddle.to_tensor([axis]) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + pd_results = split(input_tensor, size, dim=input_axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + assert len(pd_results) == len(origin_results), "length mismatched" + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + results = exe.run(fetch_list=[*origin_results, *pd_results]) + length_needed = len(results) // 2 + for i in range(length_needed): + np.testing.assert_allclose( + results[i], results[i + length_needed] + ) + paddle.disable_static() + + def test_split_composite_static(self): + paddle.seed(114514) + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + @paddle.jit.to_static + def computation_graph(in1: paddle.Tensor, in2: paddle.Tensor): + y1 = in1 * 1.5 + 1.0 + y1 = paddle.minimum(y1, paddle.to_tensor([0], dtype=paddle.float32)) + out1 = y1.mean(axis=0) + + y2 = in2 * 1.5 + 1.0 + y2 = paddle.minimum(y2, paddle.to_tensor([0], dtype=paddle.float32)) + out2 = y2.mean(axis=0) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + + return res1, res2 + + res1, res2 = computation_graph(in1, in2) + np.testing.assert_allclose(res1.numpy(), res2.numpy()) + + def test_static_graph(self): + """Test static graph execution""" + # fixed random seed for reproducibility + np.random.seed(114514) + # old static graph mode + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 6], dtype='float32') + result0, result1 = split(x, split_size_or_sections=[3, 3], dim=1) + output = result0 * 2.0 + paddle.sin(result1) + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 6).astype('float32') + feed = {'x': input_data} + + results = exe.run(feed=feed, fetch_list=[result0, result1, output]) + + pd_result0, pd_result1 = results[0], results[1] + np.testing.assert_allclose(input_data[:, :3], pd_result0) + np.testing.assert_allclose(input_data[:, 3:], pd_result1) + + expected_output = input_data[:, :3] * 2.0 + np.sin( + input_data[:, 3:] + ) + np.testing.assert_allclose( + expected_output, results[2], 
rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_static_graph_2(self): + """Test static graph execution with a Tensor-typed dim""" + np.random.seed(114514) + axis = paddle.to_tensor(-1) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 9], dtype='float32') + result0, result1, result2 = split(x, 4, dim=axis) + output = result0 + result1 * result2 + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 9).astype('float32') + feed = {'x': input_data} + + results = exe.run( + feed=feed, fetch_list=[result0, result1, result2, output] + ) + + np.testing.assert_allclose(input_data[:, 0:4], results[0]) + np.testing.assert_allclose(input_data[:, 4:8], results[1]) + np.testing.assert_allclose(input_data[:, 8:9], results[2]) + + expected_output = ( + input_data[:, 0:4] + input_data[:, 4:8] * input_data[:, -1:] + ) + np.testing.assert_allclose( + expected_output, results[3], rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_error_hint(self): + """Test that the correct exceptions are raised for invalid split arguments in static graph mode.""" + + msg_gt_1 = "split_size_or_sections must be greater than 0." + msg_gt_2 = "len(split_size_or_sections) must not be more than input.shape[dim]." + msg_gt_3 = "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode." + msg_gt_4 = ( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples." + ) + + paddle.enable_static() + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, -2, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, (1, 1, 1, 1, 2, 2), dim=-1) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, paddle.to_tensor(2), dim=2) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, 2, dim=paddle.to_tensor(2)) + paddle.disable_static() + self.assertEqual(str(cm.exception), msg_gt_4) + + def test_basic_split(self): + """Test basic splitting with integer size""" + input_shape = [3, 6] + self._compare_with_origin_static(input_shape, 1, 0) + self._compare_with_origin_static(input_shape, 3, -1) + self._compare_with_origin_static(input_shape, 4, dim_rank=0) + self._compare_with_origin_static(input_shape, 3, dim_rank=1) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_cross_op.py b/test/legacy_test/test_cross_op.py index 573021b0d07f88..601bb87927cef5 100644 --- a/test/legacy_test/test_cross_op.py +++ b/test/legacy_test/test_cross_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, is_custom_device import paddle from paddle import base @@ -77,7 +77,8 @@ def init_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestCrossFP16Op(TestCrossOp): def initTestCase(self): diff --git
a/test/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py index 7301fbeafd0610..1362f4a6dd30a9 100644 --- a/test/legacy_test/test_determinant_op.py +++ b/test/legacy_test/test_determinant_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, get_places import paddle @@ -430,9 +430,7 @@ def setUp(self): self.x = np.vectorize(complex)( np.random.random(self.shape), np.random.random(self.shape) ).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.out_grad = ( np.array([1 + 0j, 1 + 0j] * 3 * 3) .reshape(2, 3, 3) @@ -502,9 +500,7 @@ def setUp(self): self.x = np.vectorize(complex)( np.random.random(self.shape), np.random.random(self.shape) ).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.out_grad = np.array([3 + 0j, 3 + 0j] * 6).reshape(2, 6) self.x_grad_ref_dy = self.get_numeric_grad( self.x, self.shape, self.out_grad diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py index 170ee389b552f6..81cccded682c89 100644 --- a/test/legacy_test/test_dropout_op.py +++ b/test/legacy_test/test_dropout_op.py @@ -919,6 +919,121 @@ def test_dygraph(self): ) +class TestDropout1DFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = get_places() + + def check_static_result( + self, place, input_name, input_shape, training=False, p=0.0 + ): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + input_var = paddle.static.data( + name=input_name, shape=input_shape, dtype="float32" + ) + res = paddle.nn.functional.dropout1d( + input=input_var, p=p, training=training + ) + in_np = np.random.random(input_shape).astype("float32") + exe = base.Executor(place) + fetches = exe.run( + main_prog, + feed={input_name: in_np}, + fetch_list=[res], + ) + + np.testing.assert_allclose(fetches[0], in_np, rtol=1e-05) + + def test_static(self): + for place in self.places: + self.check_static_result( + place=place, + input_name="input_2d", + input_shape=[3, 4], + training=False, + p=0.0, + ) + + self.check_static_result( + place=place, + input_name="input_3d", + input_shape=[2, 3, 4], + training=False, + p=0.0, + ) + + self.check_static_result( + place=place, + input_name="input_2d_1", + input_shape=[3, 4], + training=False, + p=1.0, + ) + + self.check_static_result( + place=place, + input_name="input_3d_1", + input_shape=[2, 3, 4], + training=False, + p=1.0, + ) + + def test_dygraph(self): + for place in self.places: + with base.dygraph.guard(place): + # Test 2D input + in_np_2d = np.random.random([3, 4]).astype("float32") + input_2d = paddle.to_tensor(in_np_2d) + res1 = paddle.nn.functional.dropout1d( + input=input_2d, p=0.0, training=False + ) + np.testing.assert_allclose(res1.numpy(), in_np_2d, rtol=1e-05) + + # Test 3D input + in_np_3d = np.random.random([2, 3, 4]).astype("float32") + input_3d = paddle.to_tensor(in_np_3d) + res2 = paddle.nn.functional.dropout1d( + input=input_3d, p=0.0, training=False + ) + np.testing.assert_allclose(res2.numpy(), in_np_3d, rtol=1e-05) + + +class TestDropout1DFAPIError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + main_prog = paddle.static.Program() + 
startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + + def test_xdim_1d(): + # dimensions of x should be 2 or 3 + x = paddle.static.data(name='x1', shape=[4], dtype="float32") + paddle.nn.functional.dropout1d(x) + + self.assertRaises(RuntimeError, test_xdim_1d) + + def test_xdim_4d(): + # dimensions of x should be 2 or 3 + x = paddle.static.data( + name='x2', shape=[2, 3, 4, 5], dtype="float32" + ) + paddle.nn.functional.dropout1d(x) + + self.assertRaises(RuntimeError, test_xdim_4d) + + def test_prob_range(): + # p should be in [0, 1] + x = paddle.static.data( + name='x3', shape=[2, 3, 4], dtype="float32" + ) + paddle.nn.functional.dropout1d(x, p=1.5) + + self.assertRaises(ValueError, test_prob_range) + + class TestDropout2DFAPI(unittest.TestCase): def setUp(self): np.random.seed(123) @@ -1404,6 +1519,12 @@ def test_p_tensor(self): np.testing.assert_array_equal(static_res, dygraph_res) +class TestDropOut1DWithProbTensor(TestDropOutWithProbTensor): + def init_info(self): + self.shape = [2, 3, 4] + self.api = paddle.nn.functional.dropout1d + + class TestDropOut2DWithProbTensor(TestDropOutWithProbTensor): def init_info(self): self.shape = [2, 3, 10, 10] diff --git a/test/legacy_test/test_elementwise_mod_op.py b/test/legacy_test/test_elementwise_mod_op.py index 982b1a310093b9..3620215c186114 100644 --- a/test/legacy_test/test_elementwise_mod_op.py +++ b/test/legacy_test/test_elementwise_mod_op.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + is_custom_device, +) from utils import dygraph_guard, static_guard import paddle @@ -124,7 +129,8 @@ def test_check_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestElementwiseModFP16Op(TestElementwiseModOp): def init_dtype(self): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 9f4fcb43bec869..a4f365ea92b1a8 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from op_test import ( + OpTest, + convert_float_to_uint16, + is_custom_device, + skip_check_grad_ci, +) import paddle from paddle import base @@ -472,7 +477,8 @@ def init_input_attr_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestElementwiseMulOpFp16(ElementwiseMulOp): def init_dtype(self): diff --git a/test/legacy_test/test_embedding_scale_grad_by_freq.py b/test/legacy_test/test_embedding_scale_grad_by_freq.py index 63e408a88422be..e996fc66c41033 100644 --- a/test/legacy_test/test_embedding_scale_grad_by_freq.py +++ b/test/legacy_test/test_embedding_scale_grad_by_freq.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import get_places import paddle from paddle.nn.functional import embedding @@ -32,9 +33,7 @@ def ref_embedding_scale_grad_(x, weight_unscaled_grad): class TestEmbeddingAPIScaleGradByFreq(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if 
paddle.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def init_data(self): self.dtype = "float32" diff --git a/test/legacy_test/test_fused_gate_attention_op.py b/test/legacy_test/test_fused_gate_attention_op.py index 43ee9ab844ee08..49f44c7f9b9d40 100644 --- a/test/legacy_test/test_fused_gate_attention_op.py +++ b/test/legacy_test/test_fused_gate_attention_op.py @@ -20,7 +20,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + is_custom_device, +) from test_sparse_attention_op import get_cuda_version import paddle @@ -30,7 +35,8 @@ @unittest.skipIf( - not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "Paddle is not compiled with CUDA", ) class TestFusedGateAttentionOp(OpTest): def setUp(self): @@ -474,7 +480,7 @@ def setUp(self): ] def test_api(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): pass query = paddle.rand(shape=self.query_shape, dtype="float32") diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py index ce26cdff7ec858..b3a9ed4a09ffee 100644 --- a/test/legacy_test/test_fused_rotary_position_embedding.py +++ b/test/legacy_test/test_fused_rotary_position_embedding.py @@ -16,6 +16,7 @@ import numpy as np import parameterized as param +from op_test import is_custom_device import paddle from paddle.base import core @@ -158,7 +159,8 @@ def paddle_fused_rotary_position_embedding( @unittest.skipIf( - not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not paddle.is_compiled_with_rocm(), "core is not compiled with CUDA or ROCM ", ) @param.parameterized_class( @@ -693,7 +695,8 @@ def test_error2(): @unittest.skipIf( - not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not paddle.is_compiled_with_rocm(), "core is not compiled with CUDA or ROCM ", ) class TestFusedRotaryPositionEmbeddingZeroSize(unittest.TestCase): diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py index 59b80920234233..c4f860bcc7e973 100644 --- a/test/legacy_test/test_gaussian_random_op.py +++ b/test/legacy_test/test_gaussian_random_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_uint16_to_float, paddle_static_guard +from op_test import ( + OpTest, + convert_uint16_to_float, + is_custom_device, + paddle_static_guard, +) import paddle from paddle import base @@ -61,7 +66,8 @@ def verify_output(self, outs): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestGaussianRandomFP16Op(OpTest): def setUp(self): @@ -111,7 +117,8 @@ def gauss_wrapper(shape, mean, std, seed, dtype=np.uint16, name=None): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestGaussianRandomBF16Op(OpTest): def setUp(self): diff --git a/test/legacy_test/test_geometric_inplace.py 
b/test/legacy_test/test_geometric_inplace.py index 9b5177eac04b8b..baed59705189aa 100644 --- a/test/legacy_test/test_geometric_inplace.py +++ b/test/legacy_test/test_geometric_inplace.py @@ -16,7 +16,7 @@ import numpy as np import scipy.stats -from op_test import get_places +from op_test import get_devices import paddle @@ -36,7 +36,7 @@ def test_fp64(): tensor_fp64.geometric_(probs=0.3) self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -96,7 +96,7 @@ def test_geometric_inplace_distribution(self): class TestGeometricInplaceEmptyTensor(unittest.TestCase): def test_geometric_inplace_op_empty_tensor(self): test_shapes = [(200, 1), (1, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -122,7 +122,7 @@ def test_grad(): geometric_grad = tensor_b.grad.numpy() self.assertTrue((geometric_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_group_norm_op_v2.py b/test/legacy_test/test_group_norm_op_v2.py index 2ae1a72c2c2b29..1a6c5aeafd8781 100644 --- a/test/legacy_test/test_group_norm_op_v2.py +++ b/test/legacy_test/test_group_norm_op_v2.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_places, is_custom_device from utils import dygraph_guard import paddle @@ -243,7 +243,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_General_Dimensions_fp16(unittest.TestCase): def test_numerical_accuracy(self): # fp16 only supported in cuda - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shapes = [ @@ -286,7 +286,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NCL_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 6, 4) @@ -327,7 +327,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NCDHW_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 6, 4, 2, 2) @@ -368,7 +368,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NLC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 6) @@ -409,7 +409,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NHWC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 2, 6) @@ -450,7 +450,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NDHWC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 2, 2, 6) diff --git a/test/legacy_test/test_imperative_triple_grad.py 
b/test/legacy_test/test_imperative_triple_grad.py index 2cec3112913fd2..a873b58768279e 100644 --- a/test/legacy_test/test_imperative_triple_grad.py +++ b/test/legacy_test/test_imperative_triple_grad.py @@ -16,6 +16,7 @@ from unittest import TestCase import numpy as np +from op_test import get_devices import paddle from paddle import base @@ -327,9 +328,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( @@ -657,9 +656,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( @@ -961,9 +958,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( diff --git a/test/legacy_test/test_index_add_op.py b/test/legacy_test/test_index_add_op.py index c98652902aa845..b3383e1ce14cef 100644 --- a/test/legacy_test/test_index_add_op.py +++ b/test/legacy_test/test_index_add_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_devices import paddle from paddle.base import core @@ -199,10 +199,7 @@ def setType(self): self.index_type = np.int32 def setPlace(self): - self.place = [] - self.place.append('cpu') - if paddle.is_compiled_with_cuda(): - self.place.append('gpu') + self.place = get_devices() def config(self): self.axis = 0 diff --git a/test/legacy_test/test_index_fill.py b/test/legacy_test/test_index_fill.py index 32035caa8c3975..147439e7aa929d 100644 --- a/test/legacy_test/test_index_fill.py +++ b/test/legacy_test/test_index_fill.py @@ -16,7 +16,7 @@ from itertools import combinations import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import Program @@ -44,7 +44,7 @@ def setUp(self): self.index_type ) - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np == 'float16' and 'cpu' in self.place: self.place.remove('cpu') @@ -150,7 +150,7 @@ def setUp(self): self.index_type ) - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np == 'float16' and 'cpu' in self.place: self.place.remove('cpu') diff --git a/test/legacy_test/test_index_put_op.py b/test/legacy_test/test_index_put_op.py index 8ef3499026e2b3..722742f2e84f97 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -120,7 +120,7 @@ def init_dtype_type(self): self.accumulate = False def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np is np.float16 and "cpu" in self.place: self.place.remove("cpu") @@ -620,7 +620,7 @@ def init_dtype_type(self): self.accumulate = False def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_dygraph_forward(self): paddle.disable_static() @@ -661,7 +661,7 @@ def setUp(self): 
self.setPlace() def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_backward(self): paddle.disable_static() @@ -1019,7 +1019,7 @@ def init_dtype_type(self): self.index_type_pd = paddle.int64 def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np is np.float16 and "cpu" in self.place: self.place.remove("cpu") diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index 41ea4ebbf7625d..fa176448470075 100755 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -2090,9 +2090,7 @@ def test_broadcast_error(self): class TestDygraphInplaceSet(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.support_dtypes = [ 'float32', 'float64', @@ -2274,7 +2272,7 @@ def leaf_inplace_error(): class TestDygraphInplaceSetFP16(TestDygraphInplaceSet): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [7, 20, 2]) @@ -2304,7 +2302,7 @@ def test_inplace_api(self): class TestDygraphInplaceSetBF16(TestDygraphInplaceSet): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [7, 20, 2]) @@ -2329,9 +2327,7 @@ def test_inplace_api(self): class TestDygraphInplaceResize(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.support_dtypes = [ 'float32', 'float64', @@ -2444,7 +2440,7 @@ def argument_error(): class TestDygraphInplaceResizeFP16(TestDygraphInplaceResize): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [3, 10, 2]) @@ -2472,7 +2468,7 @@ def test_inplace_api(self): class TestDygraphInplaceResizeBF16(TestDygraphInplaceResize): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [3, 10, 2]) diff --git a/test/legacy_test/test_ldexp.py b/test/legacy_test/test_ldexp.py index d4edd57e0cb39f..47d3025cd047bc 100644 --- a/test/legacy_test/test_ldexp.py +++ b/test/legacy_test/test_ldexp.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle @@ -86,7 +86,7 @@ def check_dtype(input, desired_dtype): class TestLdexpAPIWithDynamic(unittest.TestCase): def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_ldexp_dynamic(self): np.random.seed(7) @@ -136,7 +136,7 @@ def test_ldexp_dynamic(self): class TestLdexpAPIWithStatic(unittest.TestCase): def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_ldexp_static(self): np.random.seed(7) diff --git a/test/legacy_test/test_linalg_vecdot.py b/test/legacy_test/test_linalg_vecdot.py index 2dafe849ad2bcd..7a251943e6a990 100644 --- a/test/legacy_test/test_linalg_vecdot.py +++ b/test/legacy_test/test_linalg_vecdot.py @@ -17,6 +17,7 @@ import unittest import numpy as np +from op_test import get_places import 
paddle from paddle.base import core @@ -34,9 +35,7 @@ def setUp(self): self.init_config() self.generate_input() self.generate_expected_output() - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def generate_input(self): np.random.seed(123) diff --git a/test/legacy_test/test_log_normal_inplace.py b/test/legacy_test/test_log_normal_inplace.py index 5cb29367ee7929..e2b25289a34128 100644 --- a/test/legacy_test/test_log_normal_inplace.py +++ b/test/legacy_test/test_log_normal_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -44,7 +44,7 @@ def test_fp64(): tensor_fp64.log_normal_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_fp32() @@ -105,7 +105,7 @@ def test_log_normal_inplace_op_distribution(self): class TestLogNormalRandomInplaceOpEmptyTensor(unittest.TestCase): def test_log_normal_inplace_op_empty_tensor(self): - places = get_places(string_format=True) + places = get_devices() test_shapes = [(200, 0), (0, 200)] for place in places: paddle.set_device(place) @@ -133,7 +133,7 @@ def test_grad(): log_normal_grad = tensor_b.grad.numpy() self.assertTrue((log_normal_grad == 0).all()) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_margin_cross_entropy_op.py b/test/legacy_test/test_margin_cross_entropy_op.py index e8f3de35941639..e7bbb93e7a072f 100644 --- a/test/legacy_test/test_margin_cross_entropy_op.py +++ b/test/legacy_test/test_margin_cross_entropy_op.py @@ -15,7 +15,13 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, paddle_static_guard +from op_test import ( + OpTest, + convert_float_to_uint16, + get_places, + is_custom_device, + paddle_static_guard, +) import paddle from paddle.base import core @@ -329,16 +335,15 @@ def test_check_grad(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestMarginCrossEntropyOpV2(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) - self.places = [] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.python_out_sig = ["Loss"] @@ -501,16 +506,15 @@ def init_reduction(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestMarginCrossEntropyOpAPIError(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) - self.places = [] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.python_api = python_api diff --git a/test/legacy_test/test_matmul_0_size_op.py b/test/legacy_test/test_matmul_0_size_op.py index fc3f3c3230044b..795ffb1d9ce89a 100644 --- a/test/legacy_test/test_matmul_0_size_op.py +++ b/test/legacy_test/test_matmul_0_size_op.py @@ -14,13 +14,16 @@ import unittest +from 
op_test import is_custom_device + import paddle from paddle import _C_ops from paddle.base import core @unittest.skipIf( - not core.is_compiled_with_cuda(), "mamtul 0 size only with in cuda" + not (core.is_compiled_with_cuda() or is_custom_device()), + "matmul 0 size only supported with CUDA", ) class TestMatmulDygraph(unittest.TestCase): def test_matmul(self): diff --git a/test/legacy_test/test_max_op.py b/test/legacy_test/test_max_op.py index 64e3cd15362003..741024f8059de4 100644 --- a/test/legacy_test/test_max_op.py +++ b/test/legacy_test/test_max_op.py @@ -156,9 +156,7 @@ def setUp(self): self.expect_res = np.max( self.data, axis=tuple(self.axis), keepdims=self.keepdims ) - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def test_static(self): with static_guard(): diff --git a/test/legacy_test/test_mean_op.py b/test/legacy_test/test_mean_op.py index 464f8852ab3861..01ecd450383ec7 100644 --- a/test/legacy_test/test_mean_op.py +++ b/test/legacy_test/test_mean_op.py @@ -828,9 +828,7 @@ def setUp(self): self.x_np = np.random.randint(-1, 10000, self.x_shape).astype( self.dtype ) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def test_dygraph(self): for place in self.places: @@ -864,9 +862,7 @@ def setUp(self): self.x_np = np.random.randint(-1, 10000, self.x_shape).astype( self.dtype ) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() class TestMeanAPIBool(TestMeanAPIInt32): @@ -874,9 +870,7 @@ def setUp(self): self.x_shape = [2, 3, 4, 5] self.dtype = "bool" self.x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() class TestMeanWithTensorAxis1(TestReduceOPTensorAxisBase): diff --git a/test/legacy_test/test_merged_adam_op.py b/test/legacy_test/test_merged_adam_op.py index e590f7cfa9c900..e474a8978b4fea 100644 --- a/test/legacy_test/test_merged_adam_op.py +++ b/test/legacy_test/test_merged_adam_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle import _C_ops @@ -205,7 +205,7 @@ def run_op(use_merged): def test_main(self): for multi_precision in [False, True]: - for place in get_places(string_format=True): + for place in get_devices(): self.check_with_place(place, multi_precision) diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py index ef0cc06b117ab7..f162bfcc347938 100644 --- a/test/legacy_test/test_min_op.py +++ b/test/legacy_test/test_min_op.py @@ -143,9 +143,7 @@ def setUp(self): self.expect_res = np.min( self.data, axis=tuple(self.axis), keepdims=self.keepdims ) - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def test_static(self): with static_guard(): diff --git a/test/legacy_test/test_mode_op.py b/test/legacy_test/test_mode_op.py index 227e966b47c05a..8064c53ac5bd9e 100644 --- a/test/legacy_test/test_mode_op.py +++ b/test/legacy_test/test_mode_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, +
convert_uint16_to_float, + is_custom_device, +) import paddle from paddle import base @@ -121,7 +126,8 @@ def test_check_grad(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeFP16Op(TestModeOp): def init_dtype(self): @@ -168,7 +174,8 @@ def init_args(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeFP16OpLastdim(TestModeFP16Op): def init_args(self): @@ -177,7 +184,8 @@ def init_args(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeBF16OpLastdim(TestModeBF16Op): def init_args(self): diff --git a/test/legacy_test/test_momentum_op.py b/test/legacy_test/test_momentum_op.py index fb68dc9d91a23c..ec7411770ff3a9 100644 --- a/test/legacy_test/test_momentum_op.py +++ b/test/legacy_test/test_momentum_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle from paddle import base @@ -1036,7 +1036,7 @@ def _check_with_param_group(self, place, use_amp): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._check_with_place_amp(place, use_amp) diff --git a/test/legacy_test/test_msort_op.py b/test/legacy_test/test_msort_op.py new file mode 100644 index 00000000000000..aac9e4764e2702 --- /dev/null +++ b/test/legacy_test/test_msort_op.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
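
The contract the new msort tests check is a one-liner: paddle.msort(input=x) sorts along the first axis, matching np.sort(x, axis=0). A minimal dygraph sketch of that equivalence, mirroring the assertions in the file below (shapes illustrative):

```python
import numpy as np

import paddle

data = np.random.rand(4, 3).astype("float32")
out = paddle.msort(input=paddle.to_tensor(data))

# msort sorts each column independently, i.e. along axis 0
np.testing.assert_array_equal(out.numpy(), np.sort(data, axis=0))
```
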
+ +import unittest + +import numpy as np + +import paddle +from paddle import base +from paddle.base import core + + +class TestMsortOnCPU(unittest.TestCase): + def setUp(self): + self.place = core.CPUPlace() + + def test_api_0(self): + with base.program_guard(base.Program()): + input = paddle.static.data( + name="input", shape=[2, 3, 4], dtype="float32" + ) + output = paddle.msort(input=input) + exe = base.Executor(self.place) + data = np.array( + [ + [[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]], + [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]], + ], + dtype='float32', + ) + (result,) = exe.run(feed={'input': data}, fetch_list=[output]) + np_result = np.sort(result, axis=0) + self.assertEqual((result == np_result).all(), True) + + +class TestMsortOnGPU(TestMsortOnCPU): + def setUp(self): + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + +class TestMsortDygraph(unittest.TestCase): + def setUp(self): + self.input_data = np.random.rand(10, 10) + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + def test_api_0(self): + paddle.disable_static(self.place) + var_x = paddle.to_tensor(self.input_data) + out = paddle.msort(input=var_x) + self.assertEqual( + (np.sort(self.input_data, axis=0) == out.numpy()).all(), True + ) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_multi_label_soft_margin_loss.py b/test/legacy_test/test_multi_label_soft_margin_loss.py index 29cf724d7e69f3..5f4e8b6e33fa55 100644 --- a/test/legacy_test/test_multi_label_soft_margin_loss.py +++ b/test/legacy_test/test_multi_label_soft_margin_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -145,7 +145,7 @@ def test_MultiLabelSoftMarginLoss(self): input = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) label = np.random.randint(0, 2, size=(5, 5)).astype(np.float64) - places = get_places(string_format=True) + places = get_devices() reductions = ['sum', 'mean', 'none'] for place in places: for reduction in reductions: diff --git a/test/legacy_test/test_nadam_op.py b/test/legacy_test/test_nadam_op.py index 509eba6dc66176..e84723ffed7e4a 100644 --- a/test/legacy_test/test_nadam_op.py +++ b/test/legacy_test/test_nadam_op.py @@ -16,7 +16,7 @@ from copy import deepcopy import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices, get_places import paddle from paddle import base @@ -460,7 +460,7 @@ def _test_nadam_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_nadam_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_normal_inplace.py b/test/legacy_test/test_normal_inplace.py index 762595bdd52ae8..775d38fdbaff4d 100644 --- a/test/legacy_test/test_normal_inplace.py +++ b/test/legacy_test/test_normal_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -43,7 +43,7 @@ def test_fp64(): tensor_fp64.normal_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -64,7 +64,7
@@ def test_fp64(): tensor_fp64.normal_() self.assertEqual(tensor_fp64.dtype, paddle.complex128) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -164,7 +164,7 @@ def test_normal_inplace_op_distribution(self): class TestNormalRandomInplaceOpEmptyTensor(unittest.TestCase): def test_normal_inplace_op_empty_tensor(self): test_shapes = [(200, 0), (0, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -190,7 +190,7 @@ def test_grad(): normal_grad = tensor_b.grad.numpy() self.assertTrue((normal_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() @@ -215,7 +215,7 @@ def test_grad(): self.assertTrue((normal_grad.real == 0).all()) self.assertTrue((normal_grad.imag == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py index 46c3ab42ab99f3..e1ed377e851841 100644 --- a/test/legacy_test/test_pad3d_op.py +++ b/test/legacy_test/test_pad3d_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_places +from op_test import ( + OpTest, + convert_float_to_uint16, + get_places, + is_custom_device, +) import paddle import paddle.nn.functional as F @@ -221,7 +226,8 @@ def test_check_output(self): def create_test_fp16(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dFp16(parent): def get_dtype(self): @@ -304,7 +310,8 @@ def test_check_grad_normal(self): # ----------------Pad3d complex64---------------- def create_test_complex64(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dComplex64(parent): def get_dtype(self): @@ -344,7 +351,8 @@ def test_check_grad_normal(self): def create_test_complex128(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dComplex128(parent): def get_dtype(self): diff --git a/test/legacy_test/test_pow.py b/test/legacy_test/test_pow.py index 087c748337bf67..b3f32797cb43d7 100755 --- a/test/legacy_test/test_pow.py +++ b/test/legacy_test/test_pow.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.static import Program, program_guard @@ -79,7 +79,7 @@ class TestPowerAPI(unittest.TestCase): """TestPowerAPI.""" def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_power(self): """test_power.""" @@ -227,7 +227,7 @@ class TestPowerAPI_ZeroSize(unittest.TestCase): """TestPowerAPI.""" def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def _test_power(self, shape): np.random.seed(7) diff --git a/test/legacy_test/test_pow_op.py b/test/legacy_test/test_pow_op.py index 9cab82ca7f9755..cd8d5200b6b258 100644 --- a/test/legacy_test/test_pow_op.py +++ 
b/test/legacy_test/test_pow_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, get_places import paddle from paddle.framework import core @@ -39,9 +39,7 @@ def setUp(self): self.outputs = { 'Out': np.power(self.inputs['X'], self.attrs["factor"]) } - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def custom_setting(self): self.inputs = { diff --git a/test/legacy_test/test_psroi_pool_op.py b/test/legacy_test/test_psroi_pool_op.py index 1f954aa102ee05..aac28c59297ebe 100644 --- a/test/legacy_test/test_psroi_pool_op.py +++ b/test/legacy_test/test_psroi_pool_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle @@ -228,7 +228,7 @@ def test_dytype_is_float64(): ) np.testing.assert_allclose(out, expect_out, rtol=1e-05) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_output_size_is_int() @@ -282,7 +282,7 @@ def test_dytype_is_float64(): np.testing.assert_allclose(out, expect_out, rtol=1e-05) paddle.disable_static() - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_output_size_is_int() diff --git a/test/legacy_test/test_radam_op.py b/test/legacy_test/test_radam_op.py index 27124e841a58d1..23efcbf887ba25 100644 --- a/test/legacy_test/test_radam_op.py +++ b/test/legacy_test/test_radam_op.py @@ -16,7 +16,7 @@ from copy import deepcopy import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices, get_places import paddle from paddle import base @@ -471,7 +471,7 @@ def _test_radam_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_radam_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_random_seed.py b/test/legacy_test/test_random_seed.py index 2af2bfff71551b..2ef5fdc7e4a23d 100644 --- a/test/legacy_test/test_random_seed.py +++ b/test/legacy_test/test_random_seed.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle import base @@ -51,7 +52,10 @@ def test_generator_uniform_random_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -85,7 +89,7 @@ def test_generator_uniform_random_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05) @@ -107,7 +111,10 @@ def test_gen_dropout_dygraph(self): y_np = y.numpy() y1_np = y1.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> dropout dygraph >>>>>>>") np.testing.assert_allclose(y_np, y1_np, rtol=1e-05) 
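
The guard rewrite repeated across these test files follows a single pattern: tests that previously required CUDA now also run when a custom device plugin is present. A hedged sketch of the pattern (TestSomeFP16Op is a hypothetical class name; is_custom_device is the op_test helper these diffs import, so the sketch assumes it runs inside the test directory):

```python
import unittest

from op_test import is_custom_device

from paddle.base import core


@unittest.skipIf(
    not (core.is_compiled_with_cuda() or is_custom_device()),
    "core is not compiled with CUDA",
)
class TestSomeFP16Op(unittest.TestCase):  # hypothetical example class
    def test_fp16_kernel(self):
        pass  # device-only logic exercised on CUDA or a custom device
```
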
@@ -132,7 +139,10 @@ def test_gen_dropout_static(self): out1_np = np.array(out1[0]) out2_np = np.array(out2[0]) - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> dropout static >>>>>>>") np.testing.assert_allclose(out1_np, out2_np, rtol=1e-05) @@ -153,7 +163,10 @@ def test_generator_gaussian_random_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> gaussian random dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -188,7 +201,7 @@ def test_generator_gaussian_random_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> gaussian random static >>>>>>>") @@ -213,7 +226,10 @@ def test_generator_randint_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> randint dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -248,7 +264,7 @@ def test_generator_uniform_random_static_1(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05) @@ -271,7 +287,10 @@ def test_generator_randint_dygraph_1(self): x1_np = x1.numpy() x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -305,7 +324,7 @@ def test_generator_ranint_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> randint static >>>>>>>") @@ -331,7 +350,10 @@ def test_generator_randperm_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> randperm dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -366,7 +388,7 @@ def test_generator_randperm_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> randperm static >>>>>>>") diff --git a/test/legacy_test/test_ravel.py b/test/legacy_test/test_ravel.py new file mode 100644 index 00000000000000..05a21e156219d5 --- /dev/null +++ b/test/legacy_test/test_ravel.py @@ -0,0 +1,132 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import base + + +class TestPaddleRavel(unittest.TestCase): + def setUp(self): + self.input_np = np.array([[1, 2, 3], [4, 5, 6]], dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + def check_static_result(self, place): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + input_name = 'input' + input_var = paddle.static.data( + name=input_name, shape=self.input_shape, dtype=self.input_dtype + ) + res = self.op_static(input_var) + exe = base.Executor(place) + fetches = exe.run( + main_prog, + feed={input_name: self.input_np}, + fetch_list=[res], + ) + expect = ( + self.expected(self.input_np) + if callable(self.expected) + else self.expected + ) + np.testing.assert_allclose(fetches[0], expect, rtol=1e-05) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def check_dygraph_result(self, place): + with base.dygraph.guard(place): + input = paddle.to_tensor(self.input_np, stop_gradient=False) + result = self.op_dygraph(input) + expect = ( + self.expected(self.input_np) + if callable(self.expected) + else self.expected + ) + # check forward + np.testing.assert_allclose(result.numpy(), expect, rtol=1e-05) + + # check backward + paddle.autograd.backward([result]) + np.testing.assert_allclose( + input.grad.numpy(), np.ones_like(self.input_np), rtol=1e-05 + ) + + def test_dygraph(self): + for place in self.places: + self.check_dygraph_result(place=place) + + +class TestPaddleRavel_case1(TestPaddleRavel): + def setUp(self): + # check Ravel 1d + self.input_np = np.array([7, 8, 9], dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +class TestPaddleRavel_case2(TestPaddleRavel): + def setUp(self): + # check Ravel 3d + self.input_np = np.arange(24, dtype="float32").reshape(2, 3, 4) + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +class TestPaddleRavel_case3(TestPaddleRavel): + def setUp(self): + # check Ravel 0d (scalar) + self.input_np = np.array(5.0, dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + 
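
Each parameterized case above, and the empty-array case that follows, reduces to the same contract: paddle.ravel flattens its input to 1-D in the same element order as numpy's flatten, and its backward fills the input gradient with ones. A minimal dygraph sketch asserting only what these tests assert:

```python
import numpy as np

import paddle

x_np = np.arange(24, dtype="float32").reshape(2, 3, 4)
x = paddle.to_tensor(x_np, stop_gradient=False)

y = paddle.ravel(x)  # shape [24], same element order as x_np.flatten()
np.testing.assert_allclose(y.numpy(), x_np.flatten())

# ravel behaves like a reshape, so the gradient w.r.t. x is all ones
paddle.autograd.backward([y])
np.testing.assert_allclose(x.grad.numpy(), np.ones_like(x_np))
```
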
+class TestPaddleRavel_case4(TestPaddleRavel): + def setUp(self): + # check Ravel empty array + self.input_np = np.array([], dtype="float32").reshape(0, 3) + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_reduce_op.py b/test/legacy_test/test_reduce_op.py index 76b7a4a5b761a7..85e8b036d2b2fd 100644 --- a/test/legacy_test/test_reduce_op.py +++ b/test/legacy_test/test_reduce_op.py @@ -19,6 +19,7 @@ OpTest, convert_float_to_uint16, get_places, + is_custom_device, skip_check_grad_ci, ) from utils import dygraph_guard, static_guard @@ -192,7 +193,8 @@ def test_check_grad(self): def create_test_fp16_class(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestSumOpFp16(parent): def init_dtype(self): @@ -341,9 +343,7 @@ class TestSumAPIZeroDimKeepDim(unittest.TestCase): def setUp(self): np.random.seed(123) paddle.enable_static() - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def test_static(self): for place in self.places: @@ -2365,9 +2365,7 @@ def setUp(self): "complex64", "complex128", ] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def calculate_expected_result(self, x_np, axis, keepdim): expected_result = np.all(x_np, axis=axis, keepdims=keepdim) @@ -2454,9 +2452,7 @@ def setUp(self): "complex64", "complex128", ] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def calculate_expected_result(self, x_np, axis, keepdim): expected_result = np.any(x_np, axis=axis, keepdims=keepdim) diff --git a/test/legacy_test/test_restrict_nonzero.py b/test/legacy_test/test_restrict_nonzero.py index a8d072710f0a7c..62a7607f193491 100644 --- a/test/legacy_test/test_restrict_nonzero.py +++ b/test/legacy_test/test_restrict_nonzero.py @@ -15,13 +15,15 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle.base import core @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestRestrictNonzero(unittest.TestCase): def test_restrict_nonzero(self): diff --git a/test/legacy_test/test_rmsprop_op.py b/test/legacy_test/test_rmsprop_op.py index 56f682bffabc50..e814eb112ded27 100644 --- a/test/legacy_test/test_rmsprop_op.py +++ b/test/legacy_test/test_rmsprop_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import get_device_place, get_places +from op_test import get_device_place, get_devices, get_places import paddle from paddle import base @@ -416,7 +416,7 @@ def _test_rms_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_rms_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_rrelu_op.py 
b/test/legacy_test/test_rrelu_op.py index 97be548fcdf48f..e00ed4daba380a 100644 --- a/test/legacy_test/test_rrelu_op.py +++ b/test/legacy_test/test_rrelu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_device_place import paddle import paddle.nn.functional as F @@ -50,13 +50,7 @@ def setUp(self): self.upper_0 = 0.25 self.upper_1 = 0.33 - self.places = [ - ( - base.CUDAPlace(0) - if core.is_compiled_with_cuda() - else base.CPUPlace() - ) - ] + self.places = [get_device_place()] def check_static_result(self, place): with paddle.static.program_guard( diff --git a/test/legacy_test/test_set_value_op.py b/test/legacy_test/test_set_value_op.py index 2ff97d7ea7defc..c4ad490c8defb3 100644 --- a/test/legacy_test/test_set_value_op.py +++ b/test/legacy_test/test_set_value_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_places +from op_test import OpTest, convert_float_to_uint16, get_devices import paddle from paddle.base import core @@ -1277,7 +1277,7 @@ def _call_setitem_static_api(self, x): return x def test_api(self): - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) static_out = self._run_static() diff --git a/test/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py index f664f70a3b9917..be6ef62b1c0da0 100644 --- a/test/legacy_test/test_sign_op.py +++ b/test/legacy_test/test_sign_op.py @@ -194,10 +194,7 @@ def run(place): class TestSignComplexAPI(TestSignAPI): def setUp(self): - self.place = [] - self.place.append(base.CPUPlace()) - if core.is_compiled_with_cuda(): - self.place.append(base.CUDAPlace(0)) + self.place = get_places() def test_dygraph(self): with base.dygraph.guard(): diff --git a/test/legacy_test/test_soft_margin_loss.py b/test/legacy_test/test_soft_margin_loss.py index 2bb726b4bcf71c..2dc2d9f76ed600 100644 --- a/test/legacy_test/test_soft_margin_loss.py +++ b/test/legacy_test/test_soft_margin_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle @@ -127,7 +127,7 @@ class TestSoftMarginLoss(unittest.TestCase): def test_SoftMarginLoss(self): input_np = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) types = [np.int32, np.int64, np.float32, np.float64] - places = get_places(string_format=True) + places = get_devices() reductions = ['sum', 'mean', 'none'] for place in places: for reduction in reductions: diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 6d8b4becc9b48e..0746cc46d022a9 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -889,9 +889,7 @@ class TestSum_BoolToInt64_ZeroSize(unittest.TestCase): def setUp(self): np.random.seed(123) self.shape = [3, 0, 2] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def check_result( self, dygraph_result, expected_result, axis, keepdim, dtype, place diff --git a/test/legacy_test/test_tensor_type_autocast.py b/test/legacy_test/test_tensor_type_autocast.py index 865fc590bc159a..ee85c391cd415a 100644 --- a/test/legacy_test/test_tensor_type_autocast.py +++ b/test/legacy_test/test_tensor_type_autocast.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import get_places import paddle @@ -22,9 +23,7 @@ class 
TestAutocastBase(unittest.TestCase):
     def setUp(self):
         self.set_api_and_dtypes()
-        self.places = [paddle.CPUPlace()]
-        if paddle.core.is_compiled_with_cuda():
-            self.places.append(paddle.CUDAPlace(0))
+        self.places = get_places()

     def set_api_and_dtypes(self):
         pass
diff --git a/test/legacy_test/test_tensor_type_convert_api.py b/test/legacy_test/test_tensor_type_convert_api.py
new file mode 100644
index 00000000000000..0021c1d448d93b
--- /dev/null
+++ b/test/legacy_test/test_tensor_type_convert_api.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TensorDtypeConversionsTest(unittest.TestCase):
+    """
+    Unit tests for all supported tensor dtype conversion methods.
+    """
+
+    _supported_dtype_conversions = {
+        # float
+        'float16': 'float16',
+        'half': 'float16',
+        'bfloat16': 'bfloat16',
+        'float32': 'float32',
+        'float': 'float32',
+        'float64': 'float64',
+        'double': 'float64',
+        # int
+        'int8': 'int8',
+        'char': 'int8',
+        'uint8': 'uint8',
+        'byte': 'uint8',
+        'int16': 'int16',
+        'short': 'int16',
+        'int32': 'int32',
+        'int': 'int32',
+        'int64': 'int64',
+        'long': 'int64',
+        # other
+        'bool': 'bool',
+        'complex64': 'complex64',
+        'complex128': 'complex128',
+        'cfloat': 'complex64',
+        'cdouble': 'complex128',
+    }
+    _device = paddle.device.get_device()
+    _total_init_dtype = [
+        'float16',
+        'float32',
+        'float64',
+        'int8',
+        'uint8',
+        'int16',
+        'int32',
+        'int64',
+        'bool',
+        'complex64',
+        'complex128',
+    ]
+
+    def setUp(self):
+        self.shape = [10, 1000]
+
+    def _get_paddle_dtype(self, dtype_str):
+        """Get the Paddle dtype constant by string name."""
+        return getattr(paddle, dtype_str)
+
+    def test_bfloat16_conversion(self):
+        for init_dtype in self._total_init_dtype:
+            if self._device.startswith('xpu') and init_dtype == 'complex128':
+                continue
+            tensor = paddle.randn(self.shape).astype(init_dtype)
+            converted_tensor = tensor.bfloat16()
+            self.assertEqual(converted_tensor.dtype, paddle.bfloat16)
+            self.assertEqual(converted_tensor.shape, tensor.shape)
+
+        for (
+            method_name,
+            target_dtype,
+        ) in self._supported_dtype_conversions.items():
+            if self._device.startswith('xpu') and target_dtype == 'complex128':
+                continue
+            tensor = paddle.randn(self.shape).astype('bfloat16')
+            converted_tensor = getattr(tensor, method_name)()
+            self.assertEqual(
+                converted_tensor.dtype, self._get_paddle_dtype(target_dtype)
+            )
+            self.assertEqual(converted_tensor.shape, tensor.shape)
+
+    def test_all_dtype_conversions(self):
+        """Test all dtype conversion methods."""
+        for (
+            method_name,
+            target_dtype,
+        ) in self._supported_dtype_conversions.items():
+            if target_dtype == 'bfloat16':
+                continue
+            for init_dtype in self._total_init_dtype:
+                if self._device.startswith('xpu') and (
+                    target_dtype == 'complex128' or init_dtype == 'complex128'
+                ):
+                    continue  # skipTest would abort the whole test; skip only this pair
+
+                with self.subTest(
+                    method=method_name,
init_dtype=init_dtype,
+                    target_dtype=target_dtype,
+                ):
+                    self._test_single_dtype_conversion(
+                        method_name, init_dtype, target_dtype
+                    )
+
+    def _test_single_dtype_conversion(
+        self, method_name, init_dtype, target_dtype
+    ):
+        """Test a single dtype conversion method."""
+        if init_dtype.startswith('float'):
+            data_np = np.random.randn(*self.shape).astype(init_dtype)
+        elif init_dtype.startswith('complex'):
+            data_np_real = np.random.randn(*self.shape)
+            data_np_imag = np.random.randn(*self.shape)
+            data_np = data_np_real + data_np_imag * 1j
+            data_np = data_np.astype(init_dtype)
+        else:
+            data_np = np.random.randint(-100, 100, size=self.shape).astype(
+                init_dtype
+            )
+
+        tensor = paddle.to_tensor(data_np, dtype=init_dtype)
+
+        # Check if conversion method exists
+        self.assertTrue(
+            hasattr(tensor, method_name),
+            f"Tensor should have method '{method_name}'",
+        )
+        # Perform dtype conversion
+        converted_tensor = getattr(tensor, method_name)()
+
+        # Check the dtype after conversion
+        expected_dtype = self._get_paddle_dtype(target_dtype)
+        self.assertEqual(
+            converted_tensor.dtype,
+            expected_dtype,
+            f"Expected dtype {expected_dtype}, but got {converted_tensor.dtype} for method '{method_name}'",
+        )
+
+        # Check that the shape remains unchanged
+        self.assertEqual(
+            tensor.shape,
+            converted_tensor.shape,
+            f"Shape should remain unchanged after {method_name} conversion",
+        )
+
+        if target_dtype.endswith('float16'):
+            rtol = 1e-3
+            atol = 1e-3
+        else:
+            rtol = 1e-7
+            atol = 0
+
+        # Check the value after conversion
+        np.testing.assert_allclose(
+            converted_tensor.numpy(),
+            data_np.astype(target_dtype),
+            rtol=rtol,
+            atol=atol,
+            err_msg=f"Value mismatch after {method_name} conversion",
+        )
+
+    def test_method_chaining(self):
+        """Test method chaining for dtype conversions."""
+        tensor = paddle.to_tensor([1.5, 2.5, 3.5], dtype='float32')
+
+        # float32 -> int32 -> float64 -> int64
+        result = tensor.int32().float64().int64()
+        self.assertEqual(result.dtype, paddle.int64)
+
+    def test_pir_all_dtype_conversions(self):
+        """Test all dtype conversion methods for pir.Value in static graph."""
+        paddle.enable_static()
+        startup_prog = paddle.static.Program()
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, startup_prog):
+            for (
+                method_name,
+                target_dtype,
+            ) in self._supported_dtype_conversions.items():
+
+                if target_dtype == 'bfloat16':
+                    continue
+                for init_dtype in self._total_init_dtype:
+                    if (
+                        self._device.startswith('xpu')
+                        and target_dtype == 'complex128'
+                    ):
+                        # Same reasoning as above: `continue` keeps the
+                        # remaining combinations running on XPU.
+                        continue
+                    with self.subTest(
+                        pir_method=method_name,
+                        pir_init_dtype=init_dtype,
+                        pir_target_dtype=target_dtype,
+                    ):
+                        self._pir_single_dtype_conversion(
+                            method_name, init_dtype, target_dtype
+                        )
+
+    def _pir_single_dtype_conversion(
+        self, method_name, init_dtype, target_dtype
+    ):
+
+        # Create static graph input
+        x = paddle.static.data(name="x", shape=self.shape, dtype=init_dtype)
+        # Check if the method exists
+        self.assertTrue(
+            hasattr(x, method_name),
+            f"pir.Value should have method '{method_name}'",
+        )
+        # Perform dtype conversion
+        converted = getattr(x, method_name)()
+        # Check the dtype
+        expected_dtype = self._get_paddle_dtype(target_dtype)
+        self.assertEqual(
+            converted.dtype,
+            expected_dtype,
+            f"Expected pir.Value dtype {expected_dtype}, but got {converted.dtype} for method '{method_name}'",
+        )
+        # Check the shape
+        self.assertEqual(
+            tuple(x.shape),
+            tuple(converted.shape),
+            f"pir.Value shape should
remain unchanged after {method_name} conversion", + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_trace_op.py b/test/legacy_test/test_trace_op.py index a902b346432792..e5a9228219c7d1 100644 --- a/test/legacy_test/test_trace_op.py +++ b/test/legacy_test/test_trace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_places import paddle from paddle import base, tensor @@ -202,9 +202,7 @@ def test_case1(self): class TestTraceAPIZerodimCase(unittest.TestCase): def setUp(self): - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.x = np.random.random([5, 0, 0, 0]).astype('float32') def test_dygraph(self): diff --git a/test/legacy_test/test_transforms.py b/test/legacy_test/test_transforms.py index a797c4eb7fe6a3..310df4f116104a 100644 --- a/test/legacy_test/test_transforms.py +++ b/test/legacy_test/test_transforms.py @@ -19,7 +19,7 @@ import cv2 import numpy as np -from op_test import get_places +from op_test import get_devices from PIL import Image import paddle @@ -819,7 +819,7 @@ def test_color_jitter_sub_function(self): np_img_gray = (np.random.rand(28, 28, 1) * 255).astype('uint8') tensor_img_gray = F.to_tensor(np_img_gray) - places = get_places(string_format=True) + places = get_devices() def test_adjust_brightness(np_img, tensor_img): result_cv2 = np.array(F.adjust_brightness(np_img, 1.2)) @@ -956,7 +956,7 @@ def test_erase(self): np.testing.assert_equal(np.array(pil_result), expected) np_data = np.random.rand(3, 28, 28).astype('float32') - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) tensor_img = paddle.to_tensor(np_data) expected_tensor = tensor_img.clone() diff --git a/test/legacy_test/test_uniform_random_inplace_op.py b/test/legacy_test/test_uniform_random_inplace_op.py index 7424b5d982d452..5e560acdc7e9e5 100644 --- a/test/legacy_test/test_uniform_random_inplace_op.py +++ b/test/legacy_test/test_uniform_random_inplace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_uint16_to_float, get_places +from op_test import OpTest, convert_uint16_to_float, get_devices import paddle from paddle.base import core @@ -44,7 +44,7 @@ def test_fp64(): tensor_fp64.uniform_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -215,7 +215,7 @@ def test_attr_error(): class TestUniformRandomInplaceOpEmptyTensor(unittest.TestCase): def test_uniform_random_inplace_op_empty_tensor(self): test_shapes = [(200, 0), (0, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -241,7 +241,7 @@ def test_grad(): uniform_grad = tensor_b.grad.numpy() self.assertTrue((uniform_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py index ce47ce69d32e56..43fe75fed5810d 100644 --- a/test/legacy_test/test_uniform_random_op.py +++ b/test/legacy_test/test_uniform_random_op.py @@ -16,7 +16,12 @@ import numpy as np from op import Operator -from op_test 
import OpTest, convert_uint16_to_float, get_places
+from op_test import (
+    OpTest,
+    convert_uint16_to_float,
+    get_places,
+    is_custom_device,
+)

 import paddle
 from paddle import base
@@ -187,7 +192,8 @@ def test_check_api(self):


 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not (core.is_compiled_with_cuda() or is_custom_device()),
+    "core is not compiled with CUDA or custom device",
 )
 class TestUniformRandomFP16Op(TestUniformRandomOp):
     def init_dtype(self):
diff --git a/test/legacy_test/test_zero_dim_no_backward_api.py b/test/legacy_test/test_zero_dim_no_backward_api.py
index f73d72ad4bcace..55d37af35e823e 100644
--- a/test/legacy_test/test_zero_dim_no_backward_api.py
+++ b/test/legacy_test/test_zero_dim_no_backward_api.py
@@ -21,7 +21,7 @@

 import numpy as np
 from decorator_helper import prog_scope
-from op_test import get_places
+from op_test import get_devices

 import paddle
@@ -182,7 +182,7 @@ def test_one_hot_label(self):
         self.assertEqual(one_hot_label.numpy()[2], 1)

     def test_unique_consecutive(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)
             x = paddle.rand([])
             y, inverse, counts = paddle.unique_consecutive(
@@ -199,7 +199,7 @@ def test_unique_consecutive(self):
             self.assertEqual(counts.shape, [1])

     def test_unique(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)
             x = paddle.rand([])
             y, index, inverse, counts = paddle.unique(
diff --git a/test/legacy_test/test_zero_dim_sundry_dygraph_api.py b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
index c0e238bf3fb5f8..bc958ca42bf242 100644
--- a/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
+++ b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
@@ -21,7 +21,7 @@
 import unittest

 import numpy as np
-from op_test import get_device_place, get_places
+from op_test import get_device_place, get_devices

 import paddle
 import paddle.nn.functional as F
@@ -1691,7 +1691,7 @@ def test_lerp(self):
         self.assertEqual(y2.grad.shape, [])

     def test_repeat_interleave(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)

             x = paddle.randn(())
diff --git a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
index ff654a52c878a5..0761e4cf84e26d 100644
--- a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
+++ b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
@@ -208,12 +208,12 @@ def run_program(
     image_shape = [3, 224, 224]
     config = paddle.inference.Config(model_path)
     config.disable_gpu()
-    config.enable_mkldnn()
+    config.enable_onednn()
     config.switch_ir_optim()
     config.set_cpu_math_library_num_threads(1)
     config.disable_glog_info()
     if is_quantized_model:
-        config.enable_mkldnn_int8()
+        config.enable_onednn_int8()
     predictor = paddle.inference.create_predictor(config)

     input_names = predictor.get_input_names()
diff --git a/test/mkldnn/test_shape_mkldnn_op.py b/test/mkldnn/test_shape_mkldnn_op.py
index 4ae0e02b98f99e..1531980cda91da 100644
--- a/test/mkldnn/test_shape_mkldnn_op.py
+++ b/test/mkldnn/test_shape_mkldnn_op.py
@@ -26,7 +26,7 @@ def setUp(self):
         self.op_type = "shape"
         self.python_api = paddle.tensor.shape
         self.config()
-        self.attrs = {'use_mkldnn': True}
+        self.attrs = {'use_onednn': True}
         self.inputs = {'Input': np.zeros(self.shape).astype(self.dtype)}
         self.outputs = {'Out': np.array(self.shape)}
diff --git a/test/quantization/CMakeLists.txt
b/test/quantization/CMakeLists.txt
index 20e2c49c9ea4a4..20082befcba268 100644
--- a/test/quantization/CMakeLists.txt
+++ b/test/quantization/CMakeLists.txt
@@ -179,7 +179,7 @@ function(inference_quant2_int8_lstm_model_test target fp32_model quant_model
       ${dataset_path}
       --num_threads
       1
-      --mkldnn_cache_capacity
+      --onednn_cache_capacity
       100
       --warmup_iter
       100
diff --git a/test/quantization/README.md b/test/quantization/README.md
index 4ab0a8fa06aff1..eeb4b838fe7648 100644
--- a/test/quantization/README.md
+++ b/test/quantization/README.md
@@ -28,7 +28,7 @@ A procedure on how to transform an FP32 model into a Quant model supported by th

 ## 3. How to turn a Quant model into an INT8 MKL-DNN model?

-A Quant model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Quant2Int8MkldnnPass` pass which comprises several steps:
+A Quant model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Quant2Int8OnednnPass` pass, which comprises several steps:

 ### Gathering scales

@@ -88,12 +88,12 @@ Having gathered all the data needed for quantization we apply the `cpu_quantize_

 ## 4. Code example

-The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model graph:
+The code snippet shows how the `Quant2Int8OnednnPass` can be applied to a model graph:

 ```python
 import paddle
 import paddle.static as static
-from paddle.static.quantization import Quant2Int8MkldnnPass
+from paddle.static.quantization import Quant2Int8OnednnPass
 from paddle.base.framework import IrGraph
 from paddle.framework import core

@@ -101,10 +101,10 @@ The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model
 graph = IrGraph(core.Graph(static.Program().desc), for_test=False)
 place = paddle.CPUPlace()
 # Convert the IrGraph to MKL-DNN supported INT8 IrGraph using the
-# Quant2Int8MkldnnPass. It requires a list of operators to be quantized
-mkldnn_pass = Quant2Int8MkldnnPass({'conv2d', 'pool2d'}, static.global_scope(), place, core, False)
-# Apply Quant2Int8MkldnnPass to IrGraph
-mkldnn_pass.apply(graph)
+# Quant2Int8OnednnPass. It requires a list of operators to be quantized
+onednn_pass = Quant2Int8OnednnPass({'conv2d', 'pool2d'}, static.global_scope(), place, core, False)
+# Apply Quant2Int8OnednnPass to IrGraph
+onednn_pass.apply(graph)
 ```

@@ -273,7 +273,7 @@ OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/static/quantizatio

 To reproduce the performance results, the environment variable `OMP_NUM_THREADS=1` and `--batch_size=1` option should be set.

-1. Transform the Quant model into INT8 model by applying the `Quant2Int8MkldnnPass` pass and save the result. You can use the script `save_quant_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
+1. Transform the Quant model into an INT8 model by applying the `Quant2Int8OnednnPass` pass and save the result. You can use the script `save_quant_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
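Because this hunk renames the public pass class, a downstream script that still does `from paddle.static.quantization import Quant2Int8MkldnnPass` will fail at import time on builds that ship only the new name. A minimal compatibility shim, assuming no alias is kept (this diff does not add one):

```python
# Hedged migration sketch: prefer the oneDNN name introduced in this PR,
# fall back to the legacy MKL-DNN spelling on older Paddle releases.
try:
    from paddle.static.quantization import Quant2Int8OnednnPass
except ImportError:
    from paddle.static.quantization import (
        Quant2Int8MkldnnPass as Quant2Int8OnednnPass,
    )
```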
```bash cd /PATH/TO/PADDLE/build diff --git a/test/quantization/quant2_int8_image_classification_comparison.py b/test/quantization/quant2_int8_image_classification_comparison.py index 7f6666c7b6a90d..edda63d5d0f532 100644 --- a/test/quantization/quant2_int8_image_classification_comparison.py +++ b/test/quantization/quant2_int8_image_classification_comparison.py @@ -25,7 +25,7 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import Quant2Int8MkldnnPass +from paddle.static.quantization import Quant2Int8OnednnPass paddle.enable_static() @@ -211,7 +211,7 @@ def _predict( graph = IrGraph(core.Graph(inference_program.desc), for_test=True) if self._debug: graph.draw('.', 'quant_orig', graph.all_op_nodes()) - quant_transform_pass = Quant2Int8MkldnnPass( + quant_transform_pass = Quant2Int8OnednnPass( self._quantized_ops, _op_ids_to_skip=self._op_ids_to_skip, _scope=inference_scope, diff --git a/test/quantization/quant2_int8_lstm_model.py b/test/quantization/quant2_int8_lstm_model.py index 8cfa3ab04666e9..f7d8553ce38cab 100644 --- a/test/quantization/quant2_int8_lstm_model.py +++ b/test/quantization/quant2_int8_lstm_model.py @@ -49,7 +49,7 @@ def parse_args(): '--num_threads', type=int, default=1, help='Number of threads.' ) parser.add_argument( - '--mkldnn_cache_capacity', + '--onednn_cache_capacity', type=int, default=0, help='Mkldnn cache capacity. The default value in Python API is 15, which can slow down int8 models. Default 0 means unlimited cache.', @@ -101,7 +101,7 @@ def set_config( self, model_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_data=None, use_analysis=False, mode="fp32", @@ -112,16 +112,16 @@ def set_config( config.disable_gpu() config.switch_use_feed_fetch_ops(True) config.switch_ir_optim(True) - config.enable_mkldnn() - config.disable_mkldnn_fc_passes() # fc passes caused dnnl error + config.enable_onednn() + config.disable_onednn_fc_passes() # fc passes caused dnnl error config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass") - config.set_mkldnn_cache_capacity(mkldnn_cache_capacity) + config.set_onednn_cache_capacity(onednn_cache_capacity) if mode == "ptq": config.enable_quantizer() config.quantizer_config().set_quant_data(warmup_data) config.quantizer_config().set_quant_batch_size(1) elif mode == "qat": - config.enable_mkldnn_int8() + config.enable_onednn_int8() return config @@ -130,7 +130,7 @@ def run_program( model_path, data_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, use_analysis=False, mode="fp32", @@ -141,7 +141,7 @@ def run_program( config = self.set_config( model_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_data, use_analysis, mode, @@ -216,7 +216,7 @@ def test_lstm_model(self): infer_data ), 'The dataset path cannot be empty. Please, use the --infer_data option.' 
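# --- Standalone sketch, not part of this diff. The mkldnn_* -> onednn_*
# --- renames in this file all follow one pattern on paddle.inference.Config.
# --- A version-tolerant caller can probe for the new spelling first; the
# --- helper below is hypothetical, and only the method names are taken
# --- from the hunks above.
def _cfg_call(config, new_name, old_name, *args):
    # Resolve a Config method under its oneDNN name, falling back to the
    # legacy MKL-DNN spelling on older releases.
    method = getattr(config, new_name, None) or getattr(config, old_name)
    return method(*args)

# Usage mirroring set_config() above (model_path is a hypothetical input):
# config = paddle.inference.Config(model_path)
# config.disable_gpu()
# _cfg_call(config, "enable_onednn", "enable_mkldnn")
# _cfg_call(config, "set_onednn_cache_capacity", "set_mkldnn_cache_capacity", 100)
# _cfg_call(config, "enable_onednn_int8", "enable_mkldnn_int8")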
num_threads = test_case_args.num_threads - mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity + onednn_cache_capacity = test_case_args.onednn_cache_capacity warmup_iter = test_case_args.warmup_iter acc_diff_threshold = test_case_args.acc_diff_threshold @@ -224,7 +224,7 @@ def test_lstm_model(self): fp32_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, False, mode="fp32", @@ -234,7 +234,7 @@ def test_lstm_model(self): fp32_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, True, mode="ptq", @@ -244,7 +244,7 @@ def test_lstm_model(self): quant_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, True, mode="qat", diff --git a/test/quantization/quant2_int8_nlp_comparison.py b/test/quantization/quant2_int8_nlp_comparison.py index bc2c0c4acbc66e..215441823f4a1c 100644 --- a/test/quantization/quant2_int8_nlp_comparison.py +++ b/test/quantization/quant2_int8_nlp_comparison.py @@ -158,9 +158,9 @@ def set_config( config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(True) - config.enable_mkldnn() + config.enable_onednn() if target == 'int8': - config.enable_mkldnn_int8(self._quantized_ops) + config.enable_onednn_int8(self._quantized_ops) config.delete_pass( "constant_folding_pass" ) # same reason as in analyzer_ernie_int8_tester.cc diff --git a/test/quantization/quant_int8_image_classification_comparison.py b/test/quantization/quant_int8_image_classification_comparison.py index f0944eb34b3afe..4fc176c45c0d43 100644 --- a/test/quantization/quant_int8_image_classification_comparison.py +++ b/test/quantization/quant_int8_image_classification_comparison.py @@ -25,7 +25,7 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import QuantInt8MkldnnPass +from paddle.static.quantization import QuantInt8OnednnPass paddle.enable_static() @@ -190,10 +190,10 @@ def _predict( if self._debug: graph.draw('.', 'quant_orig', graph.all_op_nodes()) if transform_to_int8: - mkldnn_int8_pass = QuantInt8MkldnnPass( + onednn_int8_pass = QuantInt8OnednnPass( _scope=inference_scope, _place=place ) - graph = mkldnn_int8_pass.apply(graph) + graph = onednn_int8_pass.apply(graph) else: graph = self._prepare_for_fp32_mkldnn(graph) diff --git a/test/xpu/test_xpu_stream_event.py b/test/xpu/test_xpu_stream_event.py index 9bf1d21c5ee57e..b739bc9f7ad390 100644 --- a/test/xpu/test_xpu_stream_event.py +++ b/test/xpu/test_xpu_stream_event.py @@ -12,12 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import ctypes import unittest +import numpy as np + import paddle from paddle.device import xpu +class TestCurrentStream(unittest.TestCase): + def test_current_stream(self): + if paddle.is_compiled_with_xpu(): + s = xpu.current_stream() + self.assertTrue(isinstance(s, xpu.Stream)) + + s1 = xpu.current_stream(0) + self.assertTrue(isinstance(s1, xpu.Stream)) + + s2 = xpu.current_stream(paddle.XPUPlace(0)) + self.assertTrue(isinstance(s2, xpu.Stream)) + self.assertEqual(s1, s2) + self.assertRaises(ValueError, xpu.current_stream, "xpu:0") + + class TestSynchronize(unittest.TestCase): def test_synchronize(self): if paddle.is_compiled_with_xpu(): @@ -28,5 +46,120 @@ def test_synchronize(self): self.assertRaises(ValueError, xpu.synchronize, "xpu:0") +class TestXPUStream(unittest.TestCase): + def test_xpu_stream(self): + if paddle.is_compiled_with_xpu(): + s = paddle.device.xpu.Stream() + self.assertIsNotNone(s) + + def test_xpu_stream_synchronize(self): + if paddle.is_compiled_with_xpu(): + s = paddle.device.xpu.Stream() + e1 = paddle.device.xpu.Event() + e2 = paddle.device.xpu.Event() + + e1.record(s) + e1.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.synchronize() + e2.record(s) + e2.synchronize() + + self.assertTrue(e2.query()) + + def test_xpu_stream_wait_event_and_record_event(self): + if paddle.is_compiled_with_xpu(): + s1 = xpu.Stream(0) + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + e1 = xpu.Event() + s1.record_event(e1) + + s2 = xpu.Stream(0) + s2.wait_event(e1) + s2.synchronize() + + self.assertTrue(e1.query()) + + +class TestXPUEvent(unittest.TestCase): + def test_xpu_event(self): + if paddle.is_compiled_with_xpu(): + e = paddle.device.xpu.Event() + self.assertIsNotNone(e) + s = paddle.device.xpu.current_stream() + + def test_xpu_event_methods(self): + if paddle.is_compiled_with_xpu(): + e = paddle.device.xpu.Event() + s = paddle.device.xpu.current_stream() + event_query_1 = e.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.record_event(e) + e.synchronize() + event_query_2 = e.query() + + self.assertTrue(event_query_1) + self.assertTrue(event_query_2) + + +class TestStreamGuard(unittest.TestCase): + ''' + Note: + The asynchronous execution property of XPU Stream can only be tested offline. 
+    '''
+
+    def test_stream_guard_normal(self):
+        if paddle.is_compiled_with_xpu():
+            s = paddle.device.Stream()
+            a = paddle.to_tensor(np.array([0, 2, 4], dtype="int32"))
+            b = paddle.to_tensor(np.array([1, 3, 5], dtype="int32"))
+            c = a + b
+            with paddle.device.stream_guard(s):
+                d = a + b
+            s.synchronize()
+
+            np.testing.assert_array_equal(np.array(c), np.array(d))
+
+    def test_stream_guard_default_stream(self):
+        if paddle.is_compiled_with_xpu():
+            s1 = paddle.device.current_stream()
+            with paddle.device.stream_guard(s1):
+                pass
+            s2 = paddle.device.current_stream()
+
+            self.assertTrue(id(s1.stream_base) == id(s2.stream_base))
+
+    def test_set_current_stream_default_stream(self):
+        if paddle.is_compiled_with_xpu():
+            cur_stream = paddle.device.current_stream()
+            new_stream = paddle.device.set_stream(cur_stream)
+
+            self.assertTrue(
+                id(cur_stream.stream_base) == id(new_stream.stream_base)
+            )
+
+    def test_stream_guard_raise_error(self):
+        if paddle.is_compiled_with_xpu():
+
+            def test_not_correct_stream_guard_input():
+                tmp = np.zeros(5)
+                with paddle.device.stream_guard(tmp):
+                    pass
+
+            self.assertRaises(TypeError, test_not_correct_stream_guard_input)
+
+
+class TestRawStream(unittest.TestCase):
+    def test_xpu_stream(self):
+        if paddle.is_compiled_with_xpu():
+            xpu_stream = paddle.device.xpu.current_stream().xpu_stream
+            self.assertIsInstance(xpu_stream, int)
+            # The raw handle should be wrappable as a C void pointer.
+            ptr = ctypes.c_void_p(xpu_stream)
+            self.assertIsNotNone(ptr)
+
+
 if __name__ == "__main__":
     unittest.main()
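A recurring refactor across the test hunks above replaces hand-rolled place lists with helpers imported from `op_test`. Their bodies never appear in this diff, so the following is a hypothetical reconstruction of the contract the call sites imply; only the four names and the place-object-versus-device-string split are taken from the diff, everything else is assumption:

```python
# Hypothetical reconstruction of the op_test helpers this PR standardizes on.
import paddle
from paddle.base import core


def is_custom_device():
    # Assumption: true when a custom-device plugin (e.g. an NPU backend)
    # is registered with this Paddle build.
    return bool(paddle.device.get_all_custom_device_type())


def get_places():
    # Place objects, for APIs such as Executor(place) or dygraph guards.
    places = [paddle.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(paddle.CUDAPlace(0))
    return places


def get_devices():
    # Device-name strings, for paddle.set_device(); the hunks replace the
    # older get_places(string_format=True) spelling with this helper.
    devices = ['cpu']
    if core.is_compiled_with_cuda():
        devices.append('gpu')
    return devices


def get_device_place():
    # The single preferred place: the accelerator when available, else CPU.
    return paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
```

The split matters because `paddle.set_device()` consumes device names while executors and guards consume place objects, which is why hunks that call `set_device` moved to `get_devices()` while the rest moved to `get_places()` or `get_device_place()`.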